File size: 5,401 Bytes
9e41260
889e1fa
 
 
 
d011be0
889e1fa
7bc6887
889e1fa
 
 
 
7bc6887
889e1fa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8437f06
f1779f5
 
7a5c01c
 
f1779f5
7a5c01c
fbe2197
889e1fa
f1779f5
 
 
 
ff60c7c
3ae4583
7a5c01c
889e1fa
 
 
f1779f5
 
ff60c7c
7a5c01c
3ae4583
889e1fa
 
 
 
 
 
ed67ff5
889e1fa
 
 
 
7bc6887
8437f06
f1779f5
 
 
 
12ef859
 
889e1fa
12ef859
 
 
 
d011be0
 
7a5c01c
f1779f5
 
7a5c01c
f1779f5
7bf74d5
12ef859
889e1fa
7bc6887
8437f06
 
889e1fa
 
7bc6887
889e1fa
 
 
d011be0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8437f06
fbe2197
 
fef1314
 
d011be0
 
fef1314
 
 
d011be0
 
 
8437f06
d011be0
 
fef1314
889e1fa
 
7bc6887
889e1fa
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
from moviepy.editor import AudioFileClip

# Get all available voices
async def get_voices():
    """Fetch the edge-tts voice list and index it by a display label.

    Returns:
        A dict mapping "<ShortName> - <Locale> (<Gender>)" to the voice's
        ShortName.
    """
    catalogue = await edge_tts.list_voices()
    labelled = {}
    for entry in catalogue:
        label = f"{entry['ShortName']} - {entry['Locale']} ({entry['Gender']})"
        labelled[label] = entry['ShortName']
    return labelled

# Text to speech functionality
async def text_to_speech(text, voice, rate, pitch):
    """Synthesize *text* with edge-tts and save it to a temporary MP3 file.

    Args:
        text: Text to speak.
        voice: Voice label; the part before the first " - " is used as the
            edge-tts voice short name.
        rate: Speaking-rate adjustment, sent to edge-tts as "+N%".
        pitch: Pitch adjustment, sent to edge-tts as "+NHz".

    Returns:
        ``(mp3_path, None)`` on success, or ``(None, message)`` when the text
        or the voice is missing.
    """
    if not text or not text.strip():
        message = "Please enter the text to convert."
        # gr.Warning() only raises the UI toast; it does not hand back a
        # value, so the message is returned explicitly for the caller's
        # `if warning:` check.
        gr.Warning(message)
        return None, message
    if not voice:
        message = "Please select a voice."
        gr.Warning(message)
        return None, message

    voice_short_name = voice.split(" - ")[0]
    # Slider values may arrive as floats; the "+d" format only accepts ints.
    rate_str = f"{round(rate):+d}%"
    pitch_str = f"{round(pitch):+d}Hz"
    communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)

    # Only reserve a unique path here and close the handle before writing:
    # an open NamedTemporaryFile cannot be reopened by name on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    try:
        await communicate.save(tmp_path)
    except Exception:
        # Do not leave an empty or partial file behind when synthesis fails.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
    return tmp_path, None

# Generate SRT file based on user preferences
def generate_srt(words, audio_duration, srt_path, words_per_line, lines_per_paragraph):
    """Write an SRT subtitle file that spreads *words* across the audio.

    Each cue (subtitle entry) holds up to ``lines_per_paragraph`` lines of up
    to ``words_per_line`` words. Cue boundaries are placed in proportion to
    the number of words spoken so far (a constant speaking rate is assumed),
    so cues never overlap and the last cue ends exactly at ``audio_duration``.

    Args:
        words: Sequence of words, in spoken order.
        audio_duration: Length of the audio in seconds.
        srt_path: Destination path of the SRT file (overwritten).
        words_per_line: Maximum number of words on one subtitle line.
        lines_per_paragraph: Maximum number of lines in one cue.

    Returns:
        ``srt_path``. An empty ``words`` sequence produces an empty file.

    Raises:
        ValueError: If ``words_per_line`` or ``lines_per_paragraph`` is
            smaller than 1.
    """
    # UI sliders may deliver floats; slicing and range() need ints.
    words_per_line = int(words_per_line)
    lines_per_paragraph = int(lines_per_paragraph)
    if words_per_line < 1 or lines_per_paragraph < 1:
        raise ValueError("words_per_line and lines_per_paragraph must be at least 1")

    total_words = len(words)
    words_per_cue = words_per_line * lines_per_paragraph

    with open(srt_path, 'w', encoding='utf-8') as srt_file:
        cue_starts = range(0, total_words, words_per_cue)
        for cue_number, first in enumerate(cue_starts, start=1):
            cue_words = words[first:first + words_per_cue]
            cue_lines = [
                ' '.join(cue_words[offset:offset + words_per_line])
                for offset in range(0, len(cue_words), words_per_line)
            ]

            # Derive both timestamps from cumulative word counts instead of
            # accumulating a float step, so there is no drift and no cue can
            # start after the audio has ended.
            start_time = audio_duration * first / total_words
            end_time = audio_duration * (first + len(cue_words)) / total_words

            start_time_str = format_srt_time(start_time)
            end_time_str = format_srt_time(end_time)
            srt_file.write(f"{cue_number}\n{start_time_str} --> {end_time_str}\n")
            srt_file.write('\n'.join(cue_lines) + "\n\n")

    return srt_path

def format_srt_time(seconds):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        seconds: Offset in seconds (int or float). Negative values are
            clamped to zero because SRT has no negative timestamps.

    Returns:
        The timestamp string, rounded to the nearest millisecond.
    """
    # Convert to whole milliseconds once, with rounding: truncating the
    # fractional part turns values such as 1.001 into ",000" because of
    # binary floating-point representation error.
    total_millis = max(0, round(seconds * 1000))
    total_seconds, millis = divmod(total_millis, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{secs:02},{millis:03}"

# Text to audio and SRT functionality
async def text_to_audio_and_srt(text, voice, rate, pitch, words_per_line, lines_per_paragraph):
    """Synthesize speech for *text* and write a matching SRT subtitle file.

    Args:
        text: Raw input text; runs of whitespace and newlines are collapsed.
        voice: Voice label passed through to ``text_to_speech``.
        rate: Speaking-rate adjustment passed through to ``text_to_speech``.
        pitch: Pitch adjustment passed through to ``text_to_speech``.
        words_per_line: Words per subtitle line, passed to ``generate_srt``.
        lines_per_paragraph: Lines per subtitle entry, passed to
            ``generate_srt``.

    Returns:
        ``(audio_path, srt_path, None)`` on success, or
        ``(None, None, warning)`` when no audio was produced.
    """
    # Clean up input text: remove extra spaces and newlines
    cleaned_text = ' '.join((text or "").split())

    audio_path, warning = await text_to_speech(cleaned_text, voice, rate, pitch)
    # Also stop when no audio path came back, even without a warning value,
    # rather than handing None to AudioFileClip.
    if warning or not audio_path:
        return None, None, warning

    # The clip is only needed for its duration; close it right away so the
    # resources it holds on the audio file are released.
    audio_clip = AudioFileClip(audio_path)
    try:
        audio_duration = audio_clip.duration
    finally:
        audio_clip.close()

    # The SRT file is placed next to the audio file, sharing its base name.
    base_name = os.path.splitext(audio_path)[0]
    srt_path = f"{base_name}_subtitle.srt"

    generate_srt(cleaned_text.split(), audio_duration, srt_path, words_per_line, lines_per_paragraph)

    return audio_path, srt_path, None

# Gradio interface function
def tts_interface(text, voice, rate, pitch, words_per_line, lines_per_paragraph):
    """Synchronous wrapper that runs the async audio + SRT pipeline.

    Returns:
        The ``(audio_path, srt_path, warning)`` triple produced by
        ``text_to_audio_and_srt``.
    """
    pipeline = text_to_audio_and_srt(
        text, voice, rate, pitch, words_per_line, lines_per_paragraph
    )
    audio_file, subtitle_file, notice = asyncio.run(pipeline)
    return audio_file, subtitle_file, notice

# Create Gradio app
async def create_demo():
    """Build the Gradio Blocks UI for the text-to-speech + subtitles app.

    The voice list is fetched first so the dropdown can be populated, then
    the input controls, the output widgets and the button handler are wired.

    Returns:
        The constructed ``gr.Blocks`` app (not yet launched).
    """
    voice_map = await get_voices()
    # The leading empty entry is the default, "nothing selected" choice.
    voice_choices = ["", *voice_map]

    with gr.Blocks() as app:
        gr.Markdown(
            """
            <h1 style="text-align: center; color: #333;">Text to Speech with Subtitles</h1>
            <p style="text-align: center; color: #555;">Convert your text to natural-sounding speech and generate subtitles (SRT) for your audio.</p>
            """,
            elem_id="header",
        )

        with gr.Row():
            # Left column: text, voice and subtitle layout controls.
            with gr.Column():
                text_box = gr.Textbox(
                    label="Input Text",
                    lines=5,
                    placeholder="Enter text here...",
                )
                voice_picker = gr.Dropdown(
                    choices=voice_choices,
                    label="Select Voice",
                    value="",
                )
                rate_control = gr.Slider(
                    minimum=-50, maximum=50, value=0, step=1,
                    label="Rate Adjustment (%)",
                )
                pitch_control = gr.Slider(
                    minimum=-20, maximum=20, value=0, step=1,
                    label="Pitch Adjustment (Hz)",
                )

                line_width_control = gr.Slider(
                    minimum=3, maximum=8, value=5, step=1,
                    label="Words per Line",
                )
                paragraph_size_control = gr.Slider(
                    minimum=1, maximum=5, value=2, step=1,
                    label="Lines per Paragraph",
                )

                run_button = gr.Button("Generate Audio and Subtitles", variant="primary")

            # Right column: generated artefacts and the warning area.
            with gr.Column():
                audio_out = gr.Audio(label="Generated Audio", type="filepath")
                srt_out = gr.File(label="Generated SRT", file_count="single")
                warning_out = gr.Markdown(label="Warning", visible=False)

        # Order matches the positional parameters / return triple of
        # tts_interface.
        handler_inputs = [
            text_box,
            voice_picker,
            rate_control,
            pitch_control,
            line_width_control,
            paragraph_size_control,
        ]
        handler_outputs = [audio_out, srt_out, warning_out]
        run_button.click(fn=tts_interface, inputs=handler_inputs, outputs=handler_outputs)

    return app

# Run the app when executed as a script. create_demo() is a coroutine (it
# awaits the voice list), so the UI is built inside asyncio.run(); launch()
# is called only after that event loop has finished.
if __name__ == "__main__":
    demo = asyncio.run(create_demo())
    demo.launch()