|
<!DOCTYPE html> |
|
<html lang="en"> |
|
|
|
<head> |
|
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Speech-to-Speech Model Comparison</title>
<!-- Bootstrap 5 stylesheet. The package spec in this URL had been replaced by an
     e-mail-obfuscation placeholder, which made the stylesheet unloadable.
     NOTE(review): the original version is unknown; 5.3.3 is pinned here because the
     page only uses Bootstrap 5 utility classes - confirm against the deployed copy. -->
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet">
<!-- jQuery is loaded in the head because the inline handlers in the body rely on $. -->
<script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
|
<style>
  /* ---------- Page chrome ---------- */

  /* Light grey backdrop behind the white card. */
  body {
    font-family: 'Arial', sans-serif;
    background-color: #f4f6f9;
  }

  /* The single white "card" that holds every screen of the evaluation. */
  .container {
    padding: 30px;
    border-radius: 10px;
    background-color: #fff;
    box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
  }

  /* Screen titles. */
  h3 {
    color: #333;
    font-size: 1.2rem;
    font-weight: 700;
  }

  /* ---------- Form controls and buttons ---------- */

  /* Pill-shaped text input. */
  .form-control {
    padding: 15px;
    border-radius: 25px;
  }

  /* Pill-shaped buttons with a soft colour fade on hover. */
  .btn {
    padding: 8px 16px;
    font-size: 0.9rem;
    border-radius: 25px;
    transition: background-color 0.3s ease;
  }

  .btn-primary {
    border: none;
    background-color: #007bff;
  }

  .btn-primary:hover {
    background-color: #0056b3;
  }

  .btn-success {
    border: none;
    background-color: #28a745;
  }

  .btn-success:hover {
    background-color: #218838;
  }

  /* Applied by script to the Model A / Model B button the rater picked. */
  .btn-selected {
    color: #fff !important;
    background-color: #155724 !important;
  }

  /* The Model A / Model B choice buttons. */
  .btn-option {
    margin: 0 10px;
    padding: 8px 20px;
    font-size: 0.9rem;
  }

  /* ---------- Elements that start hidden and are revealed by script ---------- */

  #test-content {
    display: none;
  }

  /* Category / task dropdowns are deliberately wider than their grid column. */
  #category-select,
  #task-select-dropdown {
    margin: 0 auto;
    width: 120% !important;
  }

  #confirm-choice,
  #next-test {
    display: none;
    transition: opacity 0.3s ease;
  }

  /* Model identities: hidden and transparent until the .show class is added. */
  #model-comparison {
    display: none;
    opacity: 0;
    transition: opacity 0.3s ease;
  }

  #model-comparison.show {
    opacity: 1;
  }

  /* Small button pinned to the top-right corner. */
  #switch-task {
    display: none;
    position: absolute;
    top: 10px;
    right: 20px;
    padding: 5px 10px;
    font-size: 0.8rem;
  }

  #task-description {
    display: none;
  }
</style>
|
</head> |
|
|
|
<body> |
|
<div class="container py-5"> |
|
<h3 class="text-center mb-4">Speech-to-Speech Model Comparison</h3>

<!-- Introductory instructions; hidden by startTest() once a username is entered.
     Each paragraph is its own <p>: the earlier markup nested <p> inside <p>, which is
     invalid HTML, and relied on <br><br> for paragraph spacing. -->
<div id="evaluation-info" class="mb-5">
  <p class="text-start">
    <strong>Welcome to the Speech-to-Speech (S2S) Model Evaluation!</strong>
  </p>

  <p class="text-start">
    In this evaluation, you will assess the performance of 4 S2S models:
    <strong>ChatGPT-4o</strong>, <strong>FunAudioLLM</strong>, <strong>SpeechGPT</strong>, and
    <strong>Mini-Omni</strong>.
    The goal is to evaluate how well these models handle various speech tasks across different domains.
  </p>

  <p class="text-start">
    Once you select a specific domain and task (e.g., <em>Educational Tutoring</em> and <em>Rhythm Control</em>),
    you will proceed to the evaluation stage. In each round, you will be presented with an audio input.
    For example:
  </p>

  <!-- Sample input prompt. -->
  <p class="text-start">
    <span style="vertical-align: middle; line-height: 1.2; display: inline-block;"><strong>Audio Sample:</strong></span>
    <audio controls style="vertical-align: middle;">
      <source src="/static/audio/sample/input_audio.wav" type="audio/wav">
    </audio>
  </p>

  <p class="text-start">
    The corresponding text is:
    <em>"Say the following sentence at my speed first, then say it again very slowly:
    'Artificial intelligence is changing the world in many ways.'" </em>
    <small>(Note: the audio plays at 1.5x the normal speed.)</small>
  </p>

  <p class="text-start">
    The responses of different S2S models will be provided, and your task is to choose which response best follows
    the instructions. For example <small>(Note: During the evaluation process, you will be provided with responses from only the two models that have the most comparative significance.)</small>:
  </p>

  <!-- Sample responses: one player plus an assessment per model. -->
  <div class="text-start">
    <span><strong>ChatGPT-4o:</strong></span>
    <audio controls style="vertical-align: middle;">
      <source src="/static/audio/sample/4o_audio.wav" type="audio/wav">
    </audio>
  </div>
  <p class="text-start" style="margin-left: 20px;">
    <strong>Performance:</strong> Speech: Partially followed the instruction on speed. Semantics: Accurately followed the instruction, with no semantic deviation or missing information.
  </p>

  <div class="text-start">
    <span><strong>FunAudioLLM:</strong></span>
    <audio controls style="vertical-align: middle;">
      <source src="/static/audio/sample/FunAudio_audio.wav" type="audio/wav">
    </audio>
  </div>
  <p class="text-start" style="margin-left: 20px;">
    <strong>Performance:</strong> Speech: Partially followed the instruction on speed. Semantics: Accurately followed the instruction, with no semantic deviation or missing information.
  </p>

  <div class="text-start">
    <span><strong>SpeechGPT:</strong></span>
    <audio controls style="vertical-align: middle;">
      <source src="/static/audio/sample/SpeechGPT.wav" type="audio/wav">
    </audio>
  </div>
  <p class="text-start" style="margin-left: 20px;">
    <strong>Performance:</strong> Speech: Did not follow the instruction on speed. Semantics: Partially followed the instruction, with minor semantic deviation and missing information.
  </p>

  <div class="text-start">
    <span><strong>Mini-Omni:</strong></span>
    <audio controls style="vertical-align: middle;">
      <source src="/static/audio/sample/mini-omni.wav" type="audio/wav">
    </audio>
  </div>
  <p class="text-start" style="margin-left: 20px;">
    <strong>Performance:</strong> Speech: Did not follow the instruction on speed. Semantics: Did not follow the instruction, with significant semantic deviation and missing information.
  </p>

  <p class="text-start">
    After making your choice, you'll proceed to the next round.
  </p>

  <p class="text-start">
    <strong>Please enter your username and start the evaluation!</strong>
  </p>
</div>
|
|
|
<!-- Username entry; hidden by startTest() after a username has been accepted. -->
<div id="user-input" class="text-center">
  <div class="mb-3">
    <!-- A placeholder is not an accessible name, so the field gets a visually hidden label. -->
    <label for="username" class="visually-hidden">Your username</label>
    <input type="text" id="username" class="form-control w-50 mx-auto" placeholder="Your username">
  </div>
  <!-- type="button": this control runs a script action and must never act as a submit button. -->
  <button type="button" class="btn btn-primary" onclick="startTest()">Start Test</button>
</div>
|
|
|
|
|
<!-- Category / task picker. Shown by startTest() and switchTask(); the second heading,
     the task dropdown and the Start button stay hidden until populateTasks() reveals them. -->
<div id="task-select" class="text-center" style="display: none;">
  <h3 class="my-4" id="category-title">Select Test Category:</h3>
  <div class="d-grid gap-2 col-6 mx-auto">
    <!-- aria-labelledby gives the dropdown an accessible name from the heading above it. -->
    <select id="category-select" class="form-select mx-auto" aria-labelledby="category-title" onchange="populateTasks()">
      <option value="" disabled selected>Select Category</option>
      <option value="educational">Educational Tutoring</option>
      <option value="social">Social Companionship</option>
      <option value="entertainment">Entertainment Dubbing</option>
      <option value="medical">Medical Consultation</option>
    </select>
  </div>

  <h3 class="my-4" id="specific-task-title" style="display: none;">Select Specific Task:</h3>
  <div class="d-grid gap-2 col-6 mx-auto">
    <!-- Options are rebuilt by populateTasks() whenever the category changes. -->
    <select id="task-select-dropdown" class="form-select mx-auto" aria-labelledby="specific-task-title" style="display: none;">
      <option value="" disabled selected>Select Specific Task</option>
    </select>
  </div>

  <button type="button" class="btn btn-primary mt-4" id="start-task-btn" onclick="selectTaskFromDropdown()"
    style="display: none;">Start Task</button>
</div>
|
|
|
<!-- Returns to the category / task picker; hidden by CSS until a task has been started. -->
<button type="button" id="switch-task" class="btn btn-warning" onclick="switchTask()">Switch Category and Tasks</button>
|
|
|
<!-- One evaluation round: prompt audio, its transcript, two anonymous responses and the
     choice controls. Hidden by CSS until a task has been started; the empty spans and
     audio elements are filled by loadNextTest(). -->
<div id="test-content">
  <div class="text-center">

    <div class="row justify-content-center">
      <div class="col-md-6 text-start double-text" style="margin-bottom: 10px;">
        <strong>Task description:</strong> <span id="task-description"></span>
      </div>
    </div>

    <div class="row justify-content-center">
      <div class="col-md-6 d-flex justify-content-center align-items-center mb-4">
        <!-- The visible caption doubles as the accessible name of the player. -->
        <strong class="me-2" id="input-audio-label">Audio:</strong>
        <audio id="input-audio" controls aria-labelledby="input-audio-label"></audio>
      </div>
    </div>

    <div class="row justify-content-center">
      <div class="col-md-6 text-start double-text" style="margin-bottom: 10px;">
        <strong>Audio text:</strong> <span id="test-text"></span>
      </div>
    </div>

    <div class="row justify-content-center">
      <div class="col-md-6 text-start">
        <p><strong>Question:</strong> Which of the following two models answers the result better?</p>
      </div>
    </div>

    <!-- Player captions use Bootstrap's .h6 class on a span rather than an <h6> element:
         they are labels, not section headings, and an <h6> here would skip heading levels. -->
    <div class="mb-4 text-center">
      <div class="model-section d-flex align-items-center justify-content-center mb-3">
        <span class="h6 me-2" id="audio-a-label" style="margin-bottom: 0; margin-top: 5px; font-weight: bold;">Model A:</span>
        <audio id="audio-a" controls aria-labelledby="audio-a-label"></audio>
      </div>
      <div class="model-section d-flex align-items-center justify-content-center">
        <span class="h6 me-2" id="audio-b-label" style="margin-bottom: 0; margin-top: 5px; font-weight: bold;">Model B:</span>
        <audio id="audio-b" controls aria-labelledby="audio-b-label"></audio>
      </div>
    </div>

    <!-- Choice buttons. selectModel() locates them by their visible text, so the
         "Model A" / "Model B" captions must stay as they are. -->
    <div class="d-flex justify-content-center mt-4">
      <button type="button" class="btn btn-success btn-option mx-2" onclick="selectModel('A')">Model A</button>
      <button type="button" class="btn btn-success btn-option mx-2" onclick="selectModel('B')">Model B</button>
    </div>

    <!-- Real model names; revealed by confirmChoice() after the rater has committed. -->
    <div id="model-comparison" class="text-center mt-4">
      <p>Model A: <span id="model-a"></span></p>
      <p>Model B: <span id="model-b"></span></p>
      <p>Your choice: <span id="chosen-model"></span></p>
    </div>

    <button type="button" id="confirm-choice" class="btn btn-primary mt-4" onclick="confirmChoice()">Confirm
      Selection</button>
    <button type="button" id="next-test" class="btn btn-primary mt-4" onclick="loadNextTest()">Next Test</button>
  </div>
</div>
|
|
|
<!-- Shown by loadNextTest() when the server reports that the current task has no more items. -->
<div id="test-completed" class="text-center" style="display: none;">
  <h3>Thank you for completing the <span id="completed-task"></span> test!</h3>
  <p>Would you like to test another category or task?</p>
  <button type="button" class="btn btn-primary" onclick="switchTask()">Yes</button>
  <button type="button" class="btn btn-secondary" onclick="endTest()">No</button>
</div>

<!-- Closes the .container card opened at the top of <body>; it previously had no
     closing tag and was only closed implicitly by the parser at </body>. -->
</div>
|
|
|
|
|
<!-- Bootstrap 5 JS bundle. The package spec in this URL had been replaced by an
     e-mail-obfuscation placeholder, which made the script unloadable.
     NOTE(review): the original version is unknown; 5.3.3 is pinned here - confirm, and
     keep it in step with the Bootstrap stylesheet version used in <head>. -->
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/js/bootstrap.bundle.min.js"></script>
|
<script> |
|
// ---- State shared by the handlers in this script ----
// username    : value taken from the #username field when the test starts
// task        : task identifier picked in the task dropdown
// chosenModel : 'A' or 'B' once the rater clicks a choice button
let username, task, chosenModel;

// Display names of the two models in the round currently on screen.
let modelA;
let modelB;

// Display name for each model key; looked up with the model_a / model_b
// values of the /next_test response.
const modelNames = {
  output_path_speechgpt: "SpeechGPT",
  output_path_miniomni: "Mini-Omni",
  output_path_4o: "ChatGPT-4o",
  output_path_funaudio: "FunAudioLLM",
  output_path_4o_cascade: "Cascade",
  output_path_4o_llama_omni: "LLaMA-Omni"
};
|
|
|
/**
 * Validate the username field and move from the intro screen to task selection.
 *
 * On success the trimmed value is stored in the shared `username` variable,
 * #evaluation-info and #user-input are hidden and #task-select is shown.
 * When the field is empty the user is alerted and nothing else changes.
 */
function startTest() {
  // Trim first: a whitespace-only entry would otherwise pass the emptiness check
  // and be sent to the server as the username.
  const entered = ($("#username").val() || "").trim();
  if (!entered) {
    alert("Please enter a username");
    return;
  }

  username = entered;
  $("#evaluation-info").hide();
  $("#user-input").hide();
  $("#task-select").show();
}
|
|
|
/**
 * Leave the current round (or the completion screen) and return to the
 * category / task picker, wiping everything the previous round displayed.
 */
function switchTask() {
  // Blank all round-specific text.
  $("#task-description, #test-text, #chosen-model, #model-a, #model-b").text('');

  // Detach the three audio players from their current sources.
  $("#input-audio, #audio-a, #audio-b").attr("src", '');

  // Drop the reveal state of the model-identity panel before hiding it.
  $("#model-comparison").removeClass('show').hide();

  // Hide every round / completion control ...
  $("#test-content, #confirm-choice, #next-test, #test-completed, #switch-task").hide();

  // ... and bring the picker back.
  $("#task-select").show();
}
|
|
|
/**
 * Start the given task: reset the round UI, register the task with the server
 * and, once the server accepts it, show the test screen and load the first item.
 *
 * @param {string} selectedTask Task identifier; stored in the shared `task` variable
 *     and sent to /start_test together with `username`.
 */
function selectTask(selectedTask) {
  task = selectedTask;

  // Clear whatever the previous round left behind.
  $("#task-description, #test-text, #chosen-model, #model-a, #model-b").text('');
  $("#input-audio, #audio-a, #audio-b").attr("src", '');
  $("#confirm-choice, #next-test").hide();
  $("#model-comparison").removeClass('show').hide();

  $.ajax({
    url: '/start_test',
    type: 'POST',
    contentType: 'application/json',
    data: JSON.stringify({ username: username, task: task }),
    success: function () {
      // Switch screens only after the server accepted the task; doing it up front
      // left the user on an empty page whenever the request failed.
      $("#task-select").hide();
      $("#switch-task").show();
      // #task-description is hidden by the stylesheet, so it has to be revealed
      // explicitly, as selectTaskFromDropdown() does.
      $("#task-description").show();
      $("#test-content").show();
      loadNextTest();
    },
    error: function (xhr, status, error) {
      console.error("Error occurred: ", status, error);
    }
  });
}
|
|
|
/**
 * Rebuild the task dropdown for the category currently selected in
 * #category-select, and show the task heading, dropdown and Start button
 * only when a category is selected.
 */
function populateTasks() {
  // [option value, visible label] pairs, in display order, per category.
  const tasksByCategory = {
    educational: [
      ["pronunciation", "Correcting pronunciation ability"],
      ["rhythm", "Rhythm control capabilities"],
      ["translation", "Cross-language translation with emotion"],
      ["language", "Language consistency"],
      ["pause", "Pause and segmentation"],
      ["polyphone", "Polyphonic word comprehension"],
      ["stress", "Emphasis control"]
    ],
    social: [
      ["emotion", "Emotion recognition and expression"],
      ["identity", "Identity coping ability"],
      ["humor", "Implications ability"],
      ["irony", "Sarcasm detection"]
    ],
    entertainment: [
      ["natural", "Ability to simulate natural sound"],
      ["singing", "Singing ability"],
      ["tongue", "Tongue twisters capabilities"],
      ["crosstalk", "Crosstalk ability"],
      ["poetry", "Poetry recitation"],
      ["role", "Role-playing"],
      ["story", "Storytelling"]
    ],
    medical: [
      ["healthcare", "Health consultation"],
      ["illness", "Querying symptoms"],
      ["psychological", "Psychological comfort"]
    ]
  };

  const category = $("#category-select").val();
  const taskDropdown = $("#task-select-dropdown");

  // Start over with just the placeholder entry.
  taskDropdown.empty();
  taskDropdown.append('<option value="" disabled selected>Select Specific Task</option>');

  // Own-property check so that only the four real categories ever match.
  const entries = Object.prototype.hasOwnProperty.call(tasksByCategory, category)
    ? tasksByCategory[category]
    : [];
  entries.forEach(function (entry) {
    taskDropdown.append($("<option>").attr("value", entry[0]).text(entry[1]));
  });

  // The task heading, dropdown and Start button appear only once a category is chosen.
  $("#specific-task-title, #task-select-dropdown, #start-task-btn").toggle(Boolean(category));
}
|
|
|
|
|
/**
 * Start the task currently selected in #task-select-dropdown.
 *
 * Stores the selection in the shared `task` variable, posts it with `username`
 * to /start_test and, on success, shows the test screen and loads the first item.
 * Alerts the user when nothing is selected or when the request fails.
 */
function selectTaskFromDropdown() {
  const selectedTask = $("#task-select-dropdown").val();
  if (!selectedTask) {
    alert("Please select a specific task.");
    return;
  }

  task = selectedTask;

  // Block repeated clicks while the request is in flight so that only one
  // /start_test call is sent per start.
  const startButton = $("#start-task-btn").prop('disabled', true);

  $.ajax({
    url: '/start_test',
    type: 'POST',
    contentType: 'application/json',
    data: JSON.stringify({ username: username, task: task }),
    success: function (data) {
      // Fall back to an empty string: jQuery treats .text(undefined) as a getter,
      // which would leave a stale description in place.
      $("#task-description").text((data && data.task_description) || '').show();
      $("#task-select").hide();
      $("#test-content").show();
      $("#switch-task").show();
      loadNextTest();
    },
    error: function (xhr, status, error) {
      console.error("Error occurred: ", status, error);
      // Without this the picker simply stayed on screen with no explanation.
      alert("The test could not be started. Please try again.");
    },
    complete: function () {
      startButton.prop('disabled', false);
    }
  });
}
|
|
|
|
|
|
|
|
|
/**
 * Fetch the next item from /next_test and render it, or show the completion
 * screen when the response message is 'Test completed'.
 *
 * For a regular item this fills the description, transcript and the three audio
 * players, records the two model display names in `modelA` / `modelB`, and resets
 * the choice controls for a fresh round.
 */
function loadNextTest() {
  $.get('/next_test', function (data) {
    if (data.message === 'Test completed') {
      $("#test-content").hide();
      $("#test-completed").show();
      $("#completed-task").text(task);
      sessionStorage.removeItem('current_index');
      return;
    }

    console.log(data);
    $("#task-description").text(data.task_description);
    $("#test-text").text(data.text);
    $("#input-audio").attr("src", data.input_path);
    $("#audio-a").attr("src", data.audio_a);
    $("#audio-b").attr("src", data.audio_b);

    // Fall back to the raw key (or an empty string) for models missing from
    // modelNames. jQuery treats .text(undefined) as a getter, so without the
    // fallback the previous round's name would remain in the panel.
    modelA = modelNames[data.model_a] || data.model_a || '';
    modelB = modelNames[data.model_b] || data.model_b || '';
    $("#model-a").text(modelA);
    $("#model-b").text(modelB);

    // Fresh round: no choice yet, identities hidden, choice buttons usable again.
    chosenModel = null;
    $("#chosen-model").text('');
    $("#next-test").hide();
    // Remove .show as well, matching how the other reset paths hide this panel.
    $("#model-comparison").removeClass('show').hide();
    $("#confirm-choice").show();
    $(".btn-option")
      .prop('disabled', false)
      .removeClass("btn-selected")
      .addClass("btn-success");
  }, 'json').fail(function (xhr, status, error) {
    console.error("Failed to load test data:", status, error);
  });
}
|
|
|
/**
 * Finish the session: thank the participant, then navigate to /thank_you.
 */
function endTest() {
  const farewell = "Thank you for participating in the test!";
  alert(farewell);

  // Navigation happens only after the alert has been dismissed.
  window.location.assign("/thank_you");
}
|
|
|
/**
 * Record the rater's provisional choice and highlight the matching button.
 *
 * @param {string} model 'A' or 'B'; stored in the shared `chosenModel` variable.
 *     Any other value is stored too but highlights nothing.
 */
function selectModel(model) {
  chosenModel = model;

  // Put both choice buttons back into their neutral, enabled state.
  const optionButtons = $(".btn-option");
  optionButtons.prop('disabled', false);
  optionButtons.removeClass("btn-selected").addClass("btn-success");

  if (model !== 'A' && model !== 'B') {
    return;
  }

  // The chosen button is found through its visible caption.
  $("button:contains('Model " + model + "')")
    .removeClass("btn-success")
    .addClass("btn-selected");
}
|
|
|
/**
 * Commit the rater's choice: post it to /submit_result and, once the server has
 * accepted it, reveal the real model names and the "Next Test" button.
 *
 * Alerts and returns when no model has been chosen. If the submission fails the
 * controls are restored so the rater can confirm again.
 */
function confirmChoice() {
  if (!chosenModel) {
    alert("Please select a model before confirming.");
    return;
  }

  // Freeze the choice while the submission is in flight.
  $(".btn-option").prop('disabled', true);
  $("#confirm-choice").hide();

  $.ajax({
    url: '/submit_result',
    type: 'POST',
    contentType: 'application/json',
    data: JSON.stringify({ chosen_model: chosenModel }),
    success: function () {
      // Reveal identities and allow moving on only after the result is stored.
      // Showing "Next Test" earlier let /next_test be requested before
      // /submit_result had completed.
      // "|| ''" guards against jQuery treating .text(undefined) as a getter.
      $("#chosen-model").text((chosenModel === 'A' ? modelA : modelB) || '');
      $("#model-a").text(modelA || '');
      $("#model-b").text(modelB || '');
      $("#model-comparison").addClass('show').show();
      $("#next-test").show();
    },
    error: function (xhr, status, error) {
      console.error("Error occurred: ", status, error);
      // Tell the rater and restore the controls instead of silently losing the answer.
      alert("Your choice could not be saved. Please try again.");
      $(".btn-option").prop('disabled', false);
      $("#confirm-choice").show();
    }
  });
}
|
</script> |
|
</body> |
|
|
|
</html> |
|
|