Spaces:
Running
Running
File size: 7,859 Bytes
9e41260 889e1fa 18dfe86 d011be0 889e1fa 18dfe86 889e1fa f62d4ae 889e1fa 18dfe86 099a00c 889e1fa eb47e29 889e1fa f62d4ae 889e1fa 099a00c 889e1fa 18dfe86 bee386d 80226aa bee386d ad91a22 18dfe86 bee386d 18dfe86 bee386d ad91a22 c2db727 18dfe86 c2db727 ad91a22 18dfe86 bee386d c2db727 f1779f5 ad91a22 18dfe86 c2db727 12ef859 18dfe86 12ef859 18dfe86 c2db727 7bf74d5 c2db727 889e1fa 18dfe86 c2db727 bee386d c2db727 18dfe86 889e1fa 18dfe86 889e1fa 18dfe86 889e1fa b2587ce |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 |
import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
from moviepy.editor import AudioFileClip
# Get all available voices
async def get_voices():
    """Fetch the edge-tts voice list as a ``label -> ShortName`` mapping.

    Each label has the form ``"<ShortName> (<Locale>, <Gender>)"`` and maps
    to the voice's ``ShortName``.
    """
    labelled = {}
    for entry in await edge_tts.list_voices():
        short_name = entry['ShortName']
        label = f"{short_name} ({entry['Locale']}, {entry['Gender']})"
        labelled[label] = short_name
    return labelled
# Text to speech functionality
async def text_to_speech(text, voice, rate, pitch, output_path):
    """Synthesize *text* with edge-tts and save the audio to *output_path*.

    Args:
        text: Text to speak.
        voice: Voice label of the form ``"<ShortName> (<Locale>, <Gender>)"``;
            only the part before ``" ("`` is handed to edge-tts.
        rate: Speaking-rate adjustment in percent (sent as e.g. ``"+10%"``).
        pitch: Pitch adjustment in Hz (sent as e.g. ``"-5Hz"``).
        output_path: File the audio is written to.

    Returns:
        ``(output_path, None)`` on success, or ``(None, message)`` when the
        input is rejected. The message is also shown via ``gr.Warning``.
    """
    if not text or not text.strip():
        message = "Please enter the text to convert into voice"
        # gr.Warning is called for its side effect; the message itself is
        # returned so callers get a truthy value to test.
        gr.Warning(message)
        return None, message
    if not voice:
        message = "Please select a voice."
        gr.Warning(message)
        return None, message

    voice_short_name = voice.split(" (")[0]
    # The "+d" format spec rejects floats, so coerce slider values first.
    rate_str = f"{int(round(rate)):+d}%"
    pitch_str = f"{int(round(pitch)):+d}Hz"
    communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)
    # Save to the specified output path
    await communicate.save(output_path)
    return output_path, None
# async def text_to_speech(text, voice, rate, pitch):
# if not text.strip():
# return None, gr.Warning("Please enter the text to convert.")
# if not voice:
# return None, gr.Warning("Please select a voice.")
# voice_short_name = voice.split(" (")[0]
# rate_str = f"{rate:+d}%"
# pitch_str = f"{pitch:+d}Hz"
# communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)
# with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
# tmp_path = tmp_file.name
# await communicate.save(tmp_path)
# return tmp_path, None
# Generate SRT file with specified lines of subtitles
def generate_srt(words, audio_duration, srt_path, num_lines, words_per_line=5):
    """Write an SRT file that spreads *words* evenly over *audio_duration*.

    Words are grouped ``words_per_line`` to a line and ``num_lines`` lines to
    a cue. Every cue gets an equal share of the audio duration, so the last
    cue ends exactly at ``audio_duration``.

    Args:
        words: Sequence of word strings, in speaking order.
        audio_duration: Total audio length in seconds.
        srt_path: Path of the SRT file to (over)write.
        num_lines: Number of text lines per cue; must be at least 1.
        words_per_line: Number of words per line; must be at least 1.

    Returns:
        ``srt_path``.

    Raises:
        ValueError: If ``num_lines`` or ``words_per_line`` is below 1.
    """
    num_lines = int(num_lines)
    words_per_line = int(words_per_line)
    if num_lines < 1 or words_per_line < 1:
        raise ValueError("num_lines and words_per_line must be at least 1")

    words_per_cue = words_per_line * num_lines
    # Ceiling division: a trailing partial cue still needs its own time slot,
    # otherwise the timestamps run past the end of the audio.
    cue_count = max(-(-len(words) // words_per_cue), 1)
    cue_duration = audio_duration / cue_count

    with open(srt_path, 'w', encoding='utf-8') as srt_file:
        for cue_index, offset in enumerate(range(0, len(words), words_per_cue)):
            cue_words = words[offset:offset + words_per_cue]
            lines = [
                ' '.join(cue_words[k:k + words_per_line])
                for k in range(0, len(cue_words), words_per_line)
            ]
            lines = [line for line in lines if line]
            # Derive times from the index instead of accumulating floats.
            start_time_str = format_srt_time(cue_index * cue_duration)
            end_time_str = format_srt_time((cue_index + 1) * cue_duration)
            srt_file.write(
                f"{cue_index + 1}\n{start_time_str} --> {end_time_str}\n"
                + "\n".join(lines)
                + "\n\n"
            )
    return srt_path
# def generate_srt(words, audio_duration, srt_path, num_lines):
# with open(srt_path, 'w', encoding='utf-8') as srt_file:
# divisor = len(words) // (5 * num_lines)
# if divisor == 0:
# segment_duration = audio_duration # Use full duration as fallback
# else:
# segment_duration = audio_duration / divisor # Calculate duration per segment
# current_time = 0
# for i in range(0, len(words), 5 * num_lines):
# lines = []
# for j in range(num_lines):
# line = ' '.join(words[i + j * 5:i + (j + 1) * 5])
# if line:
# lines.append(line)
# start_time = current_time
# end_time = start_time + segment_duration
# start_time_str = format_srt_time(start_time)
# end_time_str = format_srt_time(end_time)
# srt_file.write(f"{i // (5 * num_lines) + 1}\n{start_time_str} --> {end_time_str}\n" + "\n".join(lines) + "\n\n")
# current_time += segment_duration
# return srt_path
def format_srt_time(seconds):
    """Format a duration in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to the nearest millisecond once, up front, so that
    float noise (e.g. ``2.3`` being stored as ``2.2999...``) does not drop a
    millisecond and a rounded-up fraction carries into the seconds field.
    Negative input is clamped to zero.
    """
    total_millis = max(round(seconds * 1000), 0)
    total_seconds, millis = divmod(total_millis, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{secs:02},{millis:03}"
# Text to audio and SRT functionality
async def text_to_audio_and_srt(text, voice, rate, pitch, num_lines, output_audio_path, output_srt_path):
    """Synthesize *text* to audio and write a matching SRT subtitle file.

    Args:
        text: Text to speak and to subtitle.
        voice, rate, pitch: Forwarded unchanged to ``text_to_speech``.
        num_lines: Number of text lines per subtitle cue.
        output_audio_path: Where the audio file is written.
        output_srt_path: Where the SRT file is written.

    Returns:
        ``(audio_path, output_srt_path, None)`` on success, or
        ``(None, None, warning)`` when ``text_to_speech`` rejected the input.
    """
    audio_path, warning = await text_to_speech(text, voice, rate, pitch, output_audio_path)
    # A rejected request may come back with a falsy warning value, so a
    # missing audio path is treated as rejection too.
    if warning or audio_path is None:
        return None, None, warning

    audio_clip = AudioFileClip(audio_path)
    try:
        audio_duration = audio_clip.duration
    finally:
        # Release the reader the clip holds on the audio file.
        audio_clip.close()

    # Generate SRT file based on the entire text
    words = text.split()
    generate_srt(words, audio_duration, output_srt_path, num_lines)
    return audio_path, output_srt_path, None
# async def text_to_audio_and_srt(text, voice, rate, pitch, num_lines):
# audio_path, warning = await text_to_speech(text, voice, rate, pitch)
# if warning:
# return None, None, warning
# audio_clip = AudioFileClip(audio_path)
# audio_duration = audio_clip.duration
# # Generate SRT file based on the entire text
# base_name = os.path.splitext(audio_path)[0]
# srt_path = f"{base_name}_subtitle.srt"
# words = text.split()
# generate_srt(words, audio_duration, srt_path, num_lines)
# return audio_path, srt_path, None
# Gradio interface function
def tts_interface(text, voice, rate, pitch, num_lines, output_audio_path="output_audio.mp3", output_srt_path="output_subtitle.srt"):
    """Synchronous Gradio callback: validate input, then run the async pipeline.

    Args:
        text: Text to convert.
        voice: Selected voice label.
        rate: Rate adjustment in percent.
        pitch: Pitch adjustment in Hz.
        num_lines: Number of text lines per subtitle cue; must be positive.
        output_audio_path: Where the audio file is written.
        output_srt_path: Where the SRT file is written.

    Returns:
        ``(audio_path, srt_path, warning)``; the first two are ``None`` when
        validation fails or the pipeline raises.
    """
    # NOTE(review): the default output paths are fixed file names, so
    # concurrent requests would overwrite each other's files - confirm
    # whether per-request paths are needed.
    if not text or not text.strip():
        return None, None, gr.Warning("Text input cannot be empty.")

    # Coerce up front so a float or missing value is rejected here rather
    # than failing later inside range().
    try:
        line_count = int(num_lines)
    except (TypeError, ValueError):
        line_count = 0
    if line_count <= 0:
        return None, None, gr.Warning("Number of SRT lines must be greater than zero.")

    try:
        audio_path, srt_path, warning = asyncio.run(
            text_to_audio_and_srt(text, voice, rate, pitch, line_count, output_audio_path, output_srt_path)
        )
        return audio_path, srt_path, warning
    except Exception as e:
        # Top-level UI boundary: report the failure instead of crashing.
        return None, None, gr.Warning(f"An error occurred: {e}")
# def tts_interface(text, voice, rate, pitch, num_lines):
# audio_path, srt_path, warning = asyncio.run(text_to_audio_and_srt(text, voice, rate, pitch, num_lines))
# return audio_path, srt_path, warning
# Create Gradio app
async def create_demo():
    """Build and return the Gradio Blocks app.

    Awaits ``get_voices()`` to fill the voice dropdown, lays out the input
    and output widgets, and wires the generate button to ``tts_interface``.
    """
    voice_map = await get_voices()
    # Leading empty entry is the initial "nothing selected" choice.
    voice_choices = [""] + list(voice_map.keys())
    header_html = (
        "\n"
        '<h1 style="text-align: center; color: #333;">Text to Speech with Subtitles</h1>\n'
        '<p style="text-align: center; color: #555;">Convert your text to natural-sounding speech and generate subtitles (SRT) for your audio.</p>\n'
    )

    with gr.Blocks() as demo:
        gr.Markdown(header_html, elem_id="header")
        with gr.Row():
            # Input widgets
            with gr.Column():
                text_box = gr.Textbox(label="Input Text", lines=5, placeholder="Enter text here...")
                voice_picker = gr.Dropdown(choices=voice_choices, label="Select Voice", value="")
                rate_ctrl = gr.Slider(minimum=-50, maximum=50, value=0, label="Rate Adjustment (%)", step=1)
                pitch_ctrl = gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1)
                lines_ctrl = gr.Slider(minimum=1, maximum=5, value=2, label="Number of SRT Lines", step=1)
                run_button = gr.Button("Generate Audio and Subtitles", variant="primary")
            # Output widgets
            with gr.Column():
                audio_out = gr.Audio(label="Generated Audio", type="filepath")
                srt_out = gr.File(label="Generated SRT", file_count="single")
                warning_out = gr.Markdown(label="Warning", visible=False)
        run_button.click(
            fn=tts_interface,
            inputs=[text_box, voice_picker, rate_ctrl, pitch_ctrl, lines_ctrl],
            outputs=[audio_out, srt_out, warning_out]
        )
    return demo
# Run the app
if __name__ == "__main__":
    # create_demo() is a coroutine (it awaits the voice list), so it is
    # driven to completion with asyncio.run before the app is launched.
    demo = asyncio.run(create_demo())
    demo.launch(show_error=True)
|