"""Gradio app: convert text to speech with edge-tts and emit an estimated SRT file."""

import asyncio
import os
import re
import tempfile

import edge_tts
import gradio as gr
# NOTE(review): AudioFileClip is not referenced anywhere in this file; the
# import is kept in case other code relies on it being importable from here.
from moviepy.editor import AudioFileClip


# Get all available voices
async def get_voices():
    """Return a mapping of display label -> edge-tts voice short name.

    The label has the form ``"<ShortName> - <Locale> (<Gender>)"`` so that
    the short name can be recovered by splitting on ``" - "``.
    """
    voices = await edge_tts.list_voices()
    return {
        f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v["ShortName"]
        for v in voices
    }


# Text to speech function
async def text_to_speech(text, voice, rate, pitch):
    """Synthesize ``text`` to a temporary MP3 file.

    Args:
        text: Text to speak.
        voice: Dropdown label as built by ``get_voices`` (short name first,
            followed by ``" - "``).
        rate: Speaking-rate adjustment in percent (sent as e.g. ``"+10%"``).
        pitch: Pitch adjustment in Hz (sent as e.g. ``"-5Hz"``).

    Returns:
        ``(path, None)`` on success, or ``(None, <result of gr.Warning>)``
        when the text is empty or no voice is selected. Callers should test
        the path for ``None`` rather than the second element, since
        ``gr.Warning`` may itself return ``None``.
    """
    if not text or not text.strip():
        return None, gr.Warning("Please enter the text to convert.")
    if not voice:
        return None, gr.Warning("Please select a voice.")

    voice_short_name = voice.split(" - ")[0]
    # Sliders may deliver floats; the ``d`` format code only accepts ints.
    rate_str = f"{int(rate):+d}%"
    pitch_str = f"{int(pitch):+d}Hz"
    communicate = edge_tts.Communicate(
        text, voice_short_name, rate=rate_str, pitch=pitch_str
    )

    # Only reserve a unique name here; close the handle before edge-tts
    # writes to the path so the file is not opened twice.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    try:
        await communicate.save(tmp_path)
    except BaseException:
        # Do not leave an empty/partial temp file behind on failure.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
    return tmp_path, None


def _format_srt_timestamp(seconds):
    """Format a non-negative number of seconds as ``HH:MM:SS,mmm``."""
    # Work in integer milliseconds to avoid float truncation artefacts
    # (e.g. 2.3 s rendering as ",299").
    total_ms = int(round(seconds * 1000))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02}:{minutes:02}:{secs:02},{millis:03}"


# Generate SRT based on estimated timing
def generate_srt(text, speech_rate, max_words_per_line):
    """Build SRT subtitle text with timings estimated from a speaking rate.

    Args:
        text: Text to subtitle; runs of whitespace are collapsed.
        speech_rate: Speaking rate in words per minute (must be > 0).
        max_words_per_line: Maximum words per subtitle cue; values below 1
            are treated as 1.

    Returns:
        The SRT document as a string (empty if ``text`` has no words). Cues
        are numbered from 1 and separated by a blank line.

    Raises:
        ValueError: If ``speech_rate`` is not positive.
    """
    if speech_rate <= 0:
        raise ValueError(
            "speech_rate must be a positive number of words per minute"
        )

    # Clean up input text and split into words
    words = re.sub(r"\s+", " ", text.strip()).split()
    words_per_cue = max(1, int(max_words_per_line))
    seconds_per_word = 60.0 / speech_rate

    cues = []
    for number, first in enumerate(range(0, len(words), words_per_cue), start=1):
        cue_words = words[first:first + words_per_cue]
        # Derive times from the word index rather than a running float sum
        # so rounding error does not accumulate across cues.
        start_str = _format_srt_timestamp(first * seconds_per_word)
        end_str = _format_srt_timestamp(
            (first + len(cue_words)) * seconds_per_word
        )
        cues.append(f"{number}\n{start_str} --> {end_str}\n{' '.join(cue_words)}\n")

    # Each cue ends with a newline; joining with another newline yields the
    # blank separator line between cues.
    return "\n".join(cues)


# Gradio interface function
def tts_interface(text, voice, rate, pitch, speech_rate, max_words_per_line):
    """Gradio callback: synthesize audio and write a matching ``.srt`` file.

    Returns:
        ``(audio_path, srt_path, None)`` on success, or
        ``(None, None, warning)`` when ``text_to_speech`` produced no audio.
    """
    audio_path, warning = asyncio.run(text_to_speech(text, voice, rate, pitch))
    # Check the path, not the warning: gr.Warning(...) may evaluate to None,
    # which would otherwise let a failed request fall through.
    if audio_path is None:
        return None, None, warning

    # Generate SRT file
    srt_content = generate_srt(text, speech_rate, max_words_per_line)
    # Swap only the extension; str.replace would touch any ".mp3" in the path.
    srt_path = os.path.splitext(audio_path)[0] + "_subtitle.srt"
    with open(srt_path, "w", encoding="utf-8") as f:
        f.write(srt_content)

    return audio_path, srt_path, None


# Create Gradio app
async def create_demo():
    """Fetch the voice list and build the Gradio ``Interface``."""
    voices = await get_voices()

    demo = gr.Interface(
        fn=tts_interface,
        inputs=[
            gr.Textbox(label="Input Text", lines=5),
            gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=""),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Rate Adjustment (%)", step=1),
            gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1),
            gr.Slider(minimum=100, maximum=300, value=150, label="Speech Rate (words per minute)", step=1),
            gr.Slider(minimum=3, maximum=8, value=5, label="Max Words per Line", step=1),
        ],
        outputs=[
            gr.Audio(label="Generated Audio", type="filepath"),
            gr.File(label="Generated Subtitle (.srt)"),
            gr.Markdown(label="Warning", visible=False),
        ],
        title="Edge TTS Text to Speech with SRT",
        description="Convert text to speech and generate synchronized subtitles based on speech rate.",
        analytics_enabled=False,
        # String form of the flag; the boolean form is deprecated in Gradio.
        allow_flagging="never",
    )
    return demo


# Run the app
if __name__ == "__main__":
    demo = asyncio.run(create_demo())
    demo.launch()