File size: 6,015 Bytes
9e41260
 
4eca143
9e41260
 
 
4eca143
 
9e41260
 
 
 
 
 
4eca143
9e41260
4eca143
9e41260
 
 
 
 
 
 
 
 
 
 
 
 
4eca143
95d33e2
c54507d
95d33e2
 
c54507d
95d33e2
 
 
 
 
c54507d
 
 
95d33e2
c54507d
 
95d33e2
c54507d
 
95d33e2
 
 
c54507d
 
 
95d33e2
 
 
c54507d
95d33e2
c54507d
 
95d33e2
 
 
 
c54507d
 
 
 
 
95d33e2
9e41260
 
 
 
 
 
 
 
95d33e2
9e41260
 
 
95d33e2
9e41260
 
95d33e2
9e41260
 
 
 
 
 
c4d5b0c
95d33e2
c54507d
 
 
 
4eca143
9e41260
 
 
 
4eca143
9e41260
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
from moviepy.editor import TextClip, concatenate_videoclips, CompositeVideoClip, AudioFileClip, ImageClip
from wand.image import Image
from wand.drawing import Drawing
from wand.color import Color

# Fetch all available voices
async def get_voices():
    """Build a mapping from display labels to Edge TTS voice short names.

    Returns:
        dict: keys are formatted as ``"<ShortName> - <Locale> (<Gender>)"``
        and values are the corresponding ``ShortName`` of each voice
        reported by ``edge_tts.list_voices()``.
    """
    label_to_short_name = {}
    for entry in await edge_tts.list_voices():
        label = f"{entry['ShortName']} - {entry['Locale']} ({entry['Gender']})"
        label_to_short_name[label] = entry['ShortName']
    return label_to_short_name

# Text-to-speech
async def text_to_speech(text, voice, rate, pitch):
    """Synthesize ``text`` with Edge TTS into a temporary MP3 file.

    Args:
        text: Text to read aloud.
        voice: Voice label; only the part before the first ``" - "`` (the
            voice short name) is passed to Edge TTS.
        rate: Speaking-rate adjustment in percent (rounded to an integer).
        pitch: Pitch adjustment in Hz (rounded to an integer).

    Returns:
        tuple: ``(mp3_path, None)`` on success, or ``(None, message)`` when
        the text or the voice is missing. In the latter case the message is
        also reported through ``gr.Warning``.

    Raises:
        Whatever ``edge_tts.Communicate.save`` raises; the temporary file is
        removed before the exception propagates.
    """
    if not text or not text.strip():
        message = "Please enter the text to convert."
        # Show the toast, but hand the message itself back to the caller so
        # that a truthiness check on the second element detects the problem.
        gr.Warning(message)
        return None, message
    if not voice:
        message = "Please select a voice."
        gr.Warning(message)
        return None, message

    voice_short_name = voice.split(" - ")[0]
    # The "+d" format spec only accepts integers, so normalise numeric input
    # first (a UI slider may hand over a float such as 5.0).
    rate_str = f"{int(round(rate)):+d}%"
    pitch_str = f"{int(round(pitch)):+d}Hz"
    communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)

    # Reserve a unique path, then close our handle before the file is written
    # through its name.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name

    saved = False
    try:
        await communicate.save(tmp_path)
        saved = True
    finally:
        if not saved:
            # Do not leave an empty or partial file behind on failure.
            try:
                os.remove(tmp_path)
            except OSError:
                pass
    return tmp_path, None

# Text-to-video
def _paginate_text(text, max_chars_per_line, max_lines_per_page):
    """Word-wrap ``text`` and group the wrapped lines into pages.

    Returns a list of page strings whose lines are joined with newlines.
    No produced line is empty; whitespace-only text yields an empty list.
    """
    wrapped = []
    current = ""
    for word in (text or "").split():
        # A token longer than a whole line can never fit (typical for text
        # without spaces, e.g. CJK), so hard-split it instead of letting it
        # overflow the frame.
        while len(word) > max_chars_per_line:
            if current:
                wrapped.append(current)
                current = ""
            wrapped.append(word[:max_chars_per_line])
            word = word[max_chars_per_line:]

        candidate = f"{current} {word}" if current else word
        if len(candidate) > max_chars_per_line:
            wrapped.append(current)
            current = word
        else:
            current = candidate
    if current:
        wrapped.append(current)

    return [
        "\n".join(wrapped[start:start + max_lines_per_page])
        for start in range(0, len(wrapped), max_lines_per_page)
    ]


def text_to_video(text, voice, rate, pitch, video_width, video_height, bg_color, text_color, text_font, text_size):
    """Render ``text`` as a narrated slideshow video.

    The text is wrapped into pages; each page is spoken via
    ``text_to_speech`` and drawn onto a still image that is shown for the
    duration of its audio. All pages are concatenated into one MP4.

    Args:
        text: Text to render and narrate.
        voice, rate, pitch: Forwarded unchanged to ``text_to_speech``.
        video_width, video_height: Frame size in pixels.
        bg_color, text_color: Colour strings passed to ``wand.color.Color``.
        text_font: Path to the font file (made absolute before use).
        text_size: Font size in pixels.

    Returns:
        tuple: ``(video_path, None)`` on success, or ``(None, warning)`` when
        speech synthesis produced no audio for a page.
    """
    # Absolute path of the font file
    font_path = os.path.abspath(text_font)

    # The drawing and image calls below are given integer geometry even if
    # the UI delivers floats.
    video_width = int(video_width)
    video_height = int(video_height)
    text_size = int(text_size)

    # Estimate page capacity. A glyph is assumed to be half the font size
    # wide. Pagination allows text_size + 15 px per line, slightly more than
    # the text_size + 10 px used when drawing, which leaves a small margin.
    # Both values are clamped to 1 to avoid division by zero and empty pages.
    max_chars_per_line = max(1, video_width // max(1, text_size // 2))
    max_lines_per_page = max(1, video_height // (text_size + 15))
    line_height = text_size + 10

    # Blank input yields no pages; keep a single empty page so that
    # text_to_speech still produces its "missing text" warning.
    pages = _paginate_text(text, max_chars_per_line, max_lines_per_page) or [""]

    audio_clips = []
    audio_paths = []
    video_clips = []
    try:
        # Page images live in a per-call directory so concurrent requests
        # cannot overwrite each other's files.
        with tempfile.TemporaryDirectory(prefix="tts_pages_") as work_dir:
            for i, page in enumerate(pages):
                # Read the whole page as one utterance; newlines are replaced
                # so that they do not cause pauses in the speech.
                audio_text = page.replace("\n", " ")
                audio_path, warning = asyncio.run(text_to_speech(audio_text, voice, rate, pitch))
                if warning or not audio_path:
                    return None, warning
                audio_paths.append(audio_path)
                audio_clip = AudioFileClip(audio_path)
                audio_clips.append(audio_clip)

                # Draw the page with wand
                img_path = os.path.join(work_dir, f"page_{i}.png")
                with Drawing() as draw:
                    draw.font = font_path
                    draw.font_size = text_size
                    draw.fill_color = Color(text_color)
                    draw.text_alignment = 'center'
                    draw.text_interline_spacing = 10

                    with Image(width=video_width, height=video_height, background=Color(bg_color)) as img:
                        page_lines = page.split("\n")
                        # Centre the text block vertically
                        total_text_height = len(page_lines) * line_height
                        start_y = (video_height - total_text_height) // 2

                        for j, line in enumerate(page_lines):
                            draw.text(video_width // 2, start_y + j * line_height, line)

                        draw(img)  # Apply the drawing to the image
                        img.format = 'png'
                        img.save(filename=img_path)

                text_clip = ImageClip(img_path).set_duration(audio_clip.duration).set_audio(audio_clip)
                video_clips.append(text_clip)

        # Concatenate all page clips
        final_video = concatenate_videoclips(video_clips)
        # Unique output name: a fixed name would be clobbered by a concurrent
        # request while an earlier result is still being served.
        fd, final_video_path = tempfile.mkstemp(prefix="output_video_", suffix=".mp4")
        os.close(fd)
        try:
            final_video.write_videofile(final_video_path, fps=24, codec="libx264")
        except Exception:
            # Do not leave a partial output file behind.
            try:
                os.remove(final_video_path)
            except OSError:
                pass
            raise
        return final_video_path, None
    finally:
        # Close the audio clips first, then delete the per-page MP3 files
        # they were reading from.
        for clip in audio_clips:
            clip.close()
        for path in audio_paths:
            try:
                os.remove(path)
            except OSError:
                pass

# Gradio interface callback
def tts_interface(text, voice, rate, pitch, video_width, video_height, bg_color, text_color, text_font, text_size):
    """Gradio callback: forward all inputs to ``text_to_video``.

    Returns:
        tuple: ``(None, video, warning)``. The first slot (the audio output)
        is always ``None``; the other two are the values returned by
        ``text_to_video``.
    """
    outcome = text_to_video(
        text,
        voice,
        rate,
        pitch,
        video_width,
        video_height,
        bg_color,
        text_color,
        text_font,
        text_size,
    )
    video_result, warning_result = outcome
    return None, video_result, warning_result

# Build the Gradio app
async def create_demo():
    """Build the Gradio interface for the text-to-speech/video tool.

    The voice list is fetched with ``get_voices()`` and offered in a
    dropdown, preceded by an empty entry that is selected by default.

    Returns:
        gr.Interface: the configured (not yet launched) interface whose
        callback is ``tts_interface``.
    """
    voices = await get_voices()

    demo = gr.Interface(
        fn=tts_interface,
        # Input order must match the parameter order of tts_interface.
        inputs=[
            gr.Textbox(label="Input Text", lines=5),
            gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=""),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Rate Adjustment (%)", step=1),
            gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1),
            gr.Slider(minimum=640, maximum=1920, value=1080, label="Video Width", step=10),
            gr.Slider(minimum=480, maximum=1080, value=720, label="Video Height", step=10),
            gr.ColorPicker(value="#000000", label="Background Color"),
            gr.ColorPicker(value="#FFFFFF", label="Text Color"),
            gr.Textbox(label="Text Font", value="msyh.ttf"),  # make sure the font file path is correct
            gr.Slider(minimum=10, maximum=100, value=24, label="Text Size", step=1)
        ],
        # Output order must match the tuple returned by tts_interface.
        outputs=[
            gr.Audio(label="Generated Audio", type="filepath"),
            gr.Video(label="Generated Video"),
            gr.Markdown(label="Warning", visible=False)
        ],
        title="Edge TTS Text to Speech and Video",
        description="Convert text to speech and video using Microsoft Edge TTS. Adjust rate and pitch: 0 is the default value, positive values increase, and negative values decrease.",
        analytics_enabled=False,
        # Gradio documents string flagging modes ("never", "auto", "manual");
        # "never" replaces the boolean False used previously.
        allow_flagging="never",
    )

    return demo

# Run the app
if __name__ == "__main__":
    # create_demo() is a coroutine (it awaits the voice list), so run it to
    # completion to obtain the interface before starting the server.
    demo = asyncio.run(create_demo())
    demo.launch()