File size: 4,219 Bytes
9e41260
889e1fa
 
 
 
d011be0
2b5d6f0
889e1fa
7bc6887
889e1fa
 
 
 
2b5d6f0
889e1fa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b5d6f0
 
 
 
ff60c7c
2b5d6f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3ae4583
2b5d6f0
889e1fa
2b5d6f0
 
 
 
 
 
 
 
889e1fa
2b5d6f0
f1779f5
2b5d6f0
 
 
12ef859
 
 
2b5d6f0
 
 
7a5c01c
2b5d6f0
 
7bf74d5
12ef859
889e1fa
7bc6887
889e1fa
 
 
2b5d6f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
889e1fa
 
7bc6887
889e1fa
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
from moviepy.editor import AudioFileClip
import re

# Get all available voices
async def get_voices():
    """Return a dict mapping a display label to the edge-tts voice ShortName.

    Each label has the form ``"<ShortName> - <Locale> (<Gender>)"`` and is
    built from the entries returned by ``edge_tts.list_voices()``.
    """
    catalogue = await edge_tts.list_voices()
    labelled = {}
    for entry in catalogue:
        label = f"{entry['ShortName']} - {entry['Locale']} ({entry['Gender']})"
        labelled[label] = entry['ShortName']
    return labelled

# Text to speech function
async def text_to_speech(text, voice, rate, pitch):
    """Synthesize ``text`` to an MP3 file using edge-tts.

    Args:
        text: Text to speak.
        voice: Voice label; the part before the first ``" - "`` is passed to
            edge-tts as the voice short name.
        rate: Speaking-rate adjustment in percent (formatted here as ``+N%``).
        pitch: Pitch adjustment in Hz (formatted here as ``+NHz``).

    Returns:
        ``(mp3_path, None)`` on success, or ``(None, gr.Warning(...))`` when
        the text is blank or no voice was selected.
    """
    if not text or not text.strip():
        return None, gr.Warning("Please enter the text to convert.")
    if not voice:
        return None, gr.Warning("Please select a voice.")

    voice_short_name = voice.split(" - ")[0]
    # The "d" format code rejects floats, so coerce numeric slider values
    # (which may arrive as floats) to integers before formatting.
    rate_str = f"{int(round(rate)):+d}%"
    pitch_str = f"{int(round(pitch)):+d}Hz"
    communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)

    # Only the unique path is needed: release our own handle before the
    # (potentially slow) synthesis call writes to that path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    try:
        await communicate.save(tmp_path)
    except Exception:
        # delete=False means nobody else cleans up a failed/partial output.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
    return tmp_path, None

# Generate SRT based on estimated timing
def generate_srt(text, speech_rate, max_words_per_line):
    """Build SRT subtitle text with timings estimated from a constant speech rate.

    The text is split on whitespace and grouped into cues of at most
    ``max_words_per_line`` words. Each cue's duration is proportional to its
    word count; cues are contiguous and start at 00:00:00,000.

    Args:
        text: Text to subtitle; any run of whitespace separates words.
        speech_rate: Speaking speed in words per minute. Must be positive.
        max_words_per_line: Maximum number of words per cue. Values below 1
            result in one word per cue.

    Returns:
        The SRT document as a string, each cue followed by a blank line, or
        an empty string when ``text`` contains no words.

    Raises:
        ValueError: If ``text`` contains words and ``speech_rate`` is not
            positive.
    """
    # str.split() with no argument already trims and collapses whitespace.
    words = text.split()
    if not words:
        return ''
    if speech_rate <= 0:
        raise ValueError("speech_rate must be a positive number of words per minute")

    cues = []
    line_words = []
    words_spoken = 0
    start_time = 0.0  # seconds
    last_index = len(words) - 1

    for index, word in enumerate(words):
        line_words.append(word)
        if len(line_words) < max_words_per_line and index != last_index:
            continue

        # Derive the end time from the cumulative word count instead of
        # summing per-cue durations, so float error does not accumulate.
        # speech_rate is words per MINUTE, hence the factor of 60.
        words_spoken += len(line_words)
        end_time = words_spoken * 60 / speech_rate

        line_text = ' '.join(line_words)
        cues.append(
            f"{len(cues) + 1}\n"
            f"{_format_srt_timestamp(start_time)} --> {_format_srt_timestamp(end_time)}\n"
            f"{line_text}\n\n"  # blank line terminates the cue, as SRT requires
        )

        start_time = end_time
        line_words = []

    return ''.join(cues)


def _format_srt_timestamp(seconds):
    """Render a non-negative number of seconds as ``HH:MM:SS,mmm``."""
    # Work in whole milliseconds so e.g. 1.9996 s becomes 00:00:02,000
    # rather than a truncated fraction.
    total_ms = int(round(seconds * 1000))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02}:{minutes:02}:{secs:02},{millis:03}"

# Gradio interface function
def tts_interface(text, voice, rate, pitch, speech_rate, max_words_per_line):
    """Gradio handler: synthesize audio and write a matching SRT file.

    Args:
        text: Text to speak and subtitle.
        voice: Selected voice label.
        rate: Speaking-rate adjustment forwarded to ``text_to_speech``.
        pitch: Pitch adjustment forwarded to ``text_to_speech``.
        speech_rate: Speech rate forwarded to ``generate_srt`` for timing.
        max_words_per_line: Maximum words per subtitle cue.

    Returns:
        ``(audio_path, srt_path, None)`` on success, or
        ``(None, None, warning)`` when no audio was produced.
    """
    audio_path, warning = asyncio.run(text_to_speech(text, voice, rate, pitch))
    # Decide on the missing audio path, not on the truthiness of `warning`:
    # a falsy warning value (e.g. None) would otherwise let a failed request
    # fall through to the path handling below and crash on None.
    if audio_path is None:
        return None, None, warning

    srt_content = generate_srt(text, speech_rate, max_words_per_line)
    # Swap only the extension; str.replace would also rewrite any ".mp3"
    # occurring earlier in the path.
    srt_path = os.path.splitext(audio_path)[0] + '_subtitle.srt'

    # Explicit UTF-8 so non-ASCII subtitle text does not depend on the
    # platform's default encoding.
    with open(srt_path, 'w', encoding='utf-8') as f:
        f.write(srt_content)

    return audio_path, srt_path, None

# Create Gradio app
async def create_demo():
    """Build the Gradio interface for the TTS + subtitle tool.

    Fetches the voice list via ``get_voices()`` to populate the voice
    dropdown, then wires ``tts_interface`` to six inputs (text, voice, rate,
    pitch, speech rate, max words per line) and three outputs (audio, SRT
    file, hidden warning area).

    Returns:
        The configured ``gr.Interface`` (not yet launched).
    """
    voices = await get_voices()

    demo = gr.Interface(
        fn=tts_interface,
        inputs=[
            gr.Textbox(label="Input Text", lines=5),
            # Leading empty choice so that no voice is preselected.
            gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=""),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Rate Adjustment (%)", step=1),
            gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1),
            gr.Slider(minimum=100, maximum=300, value=150, label="Speech Rate (words per minute)", step=1),
            gr.Slider(minimum=3, maximum=8, value=5, label="Max Words per Line", step=1),
        ],
        outputs=[
            gr.Audio(label="Generated Audio", type="filepath"),
            gr.File(label="Generated Subtitle (.srt)"),
            gr.Markdown(label="Warning", visible=False),
        ],
        title="Edge TTS Text to Speech with SRT",
        description="Convert text to speech and generate synchronized subtitles based on speech rate.",
        analytics_enabled=False,
        # Gradio documents this option as a string mode; "never" disables
        # flagging (the boolean form is deprecated).
        allow_flagging="never",
    )

    return demo

# Run the app
if __name__ == "__main__":
    # Building the interface is async (it fetches the voice list), so drive
    # it to completion first, then start the Gradio server.
    demo_builder = create_demo()
    demo = asyncio.run(demo_builder)
    demo.launch()