Spaces:
Runtime error
Runtime error
File size: 6,015 Bytes
9e41260 4eca143 9e41260 4eca143 9e41260 4eca143 9e41260 4eca143 9e41260 4eca143 95d33e2 c54507d 95d33e2 c54507d 95d33e2 c54507d 95d33e2 c54507d 95d33e2 c54507d 95d33e2 c54507d 95d33e2 c54507d 95d33e2 c54507d 95d33e2 c54507d 95d33e2 9e41260 95d33e2 9e41260 95d33e2 9e41260 95d33e2 9e41260 c4d5b0c 95d33e2 c54507d 4eca143 9e41260 4eca143 9e41260 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 |
import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
from moviepy.editor import TextClip, concatenate_videoclips, CompositeVideoClip, AudioFileClip, ImageClip
from wand.image import Image
from wand.drawing import Drawing
from wand.color import Color
# Fetch every voice offered by Edge TTS
async def get_voices():
    """Return a mapping from display label to Edge TTS voice short name.

    Each label has the form ``"<ShortName> - <Locale> (<Gender>)"`` and maps
    to the voice's ``ShortName`` as reported by ``edge_tts.list_voices()``.
    """
    catalog = await edge_tts.list_voices()
    label_to_short_name = {}
    for entry in catalog:
        label = f"{entry['ShortName']} - {entry['Locale']} ({entry['Gender']})"
        label_to_short_name[label] = entry['ShortName']
    return label_to_short_name
# Text-to-speech conversion
async def text_to_speech(text, voice, rate, pitch):
    """Synthesize *text* into a temporary MP3 file using Edge TTS.

    Args:
        text: Text to read aloud.
        voice: Voice label of the form ``"<ShortName> - <Locale> (<Gender>)"``;
            only the part before the first ``" - "`` is passed to Edge TTS.
        rate: Signed speaking-rate adjustment, formatted as a percentage.
        pitch: Signed pitch adjustment, formatted in Hz.

    Returns:
        ``(path, None)`` on success, where ``path`` is the MP3 file written by
        Edge TTS (the caller owns the file and is responsible for deleting
        it), or ``(None, message)`` when the input is rejected.
    """
    # gr.Warning() is called only for its toast side effect; its return value
    # is not a message, so the text is returned explicitly to let callers
    # detect the failure with a simple truthiness check.
    if not text or not text.strip():
        message = "Please enter the text to convert."
        gr.Warning(message)
        return None, message
    if not voice:
        message = "Please select a voice."
        gr.Warning(message)
        return None, message

    voice_short_name = voice.split(" - ")[0]
    # The ``+d`` format spec rejects floats, so coerce slider values first.
    rate_str = f"{int(rate):+d}%"
    pitch_str = f"{int(pitch):+d}Hz"
    communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)

    # Reserve a unique path, then close the handle before Edge TTS writes to
    # it so the file is not held open by two writers at once.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    try:
        await communicate.save(tmp_path)
    except BaseException:
        # Do not leave an empty or partial file behind on failure/cancellation.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
    return tmp_path, None
# Text-to-video conversion
def _paginate_text(text, max_chars_per_line, max_lines_per_page):
    """Wrap *text* into lines and group them into newline-joined pages."""
    import textwrap

    # Collapse all whitespace runs so wrapping works on single-spaced words.
    normalized = " ".join(text.split())
    wrapped = textwrap.wrap(
        normalized,
        width=max_chars_per_line,
        break_long_words=True,   # words wider than a line are split, not overflowed
        break_on_hyphens=False,  # otherwise keep words whole
    )
    return [
        "\n".join(wrapped[start:start + max_lines_per_page])
        for start in range(0, len(wrapped), max_lines_per_page)
    ]


def _render_page_image(page, img_path, video_width, video_height, bg_color, text_color, font_path, text_size):
    """Draw one page of horizontally centered text on a solid background and save it as PNG."""
    line_height = text_size + 10
    page_lines = page.split("\n")
    # Offset that centers the block of lines vertically.
    # NOTE(review): the y passed to draw.text is presumably the text baseline,
    # so the block may sit about one line high - confirm visually.
    start_y = (video_height - len(page_lines) * line_height) // 2
    with Drawing() as draw:
        draw.font = font_path
        draw.font_size = text_size
        draw.fill_color = Color(text_color)
        draw.text_alignment = 'center'
        draw.text_interline_spacing = 10
        with Image(width=video_width, height=video_height, background=Color(bg_color)) as img:
            for j, line in enumerate(page_lines):
                draw.text(video_width // 2, start_y + j * line_height, line)
            draw(img)  # apply the queued drawing operations to the image
            img.format = 'png'
            img.save(filename=img_path)


def _remove_quietly(path):
    """Best-effort removal of a temporary file; a missing file is not an error."""
    try:
        os.remove(path)
    except OSError:
        pass


def text_to_video(text, voice, rate, pitch, video_width, video_height, bg_color, text_color, text_font, text_size):
    """Render *text* as a narrated slideshow video.

    The text is wrapped into pages that fit the frame; each page is spoken via
    ``text_to_speech`` and shown as a still image for the duration of its
    audio, and the per-page clips are concatenated into one MP4.

    Args:
        text: Text to display and read aloud.
        voice, rate, pitch: Forwarded unchanged to ``text_to_speech``.
        video_width, video_height: Frame size in pixels.
        bg_color, text_color: Color strings passed to ``wand.color.Color``.
        text_font: Font file path; resolved to an absolute path.
        text_size: Font size; also drives the line/page capacity estimate.

    Returns:
        ``(video_path, None)`` on success, or ``(None, message)`` when the
        input is rejected or speech synthesis reports a problem.
    """
    if not text or not text.strip():
        message = "Please enter the text to convert."
        gr.Warning(message)
        return None, message

    # UI sliders may deliver floats; the integer arithmetic and image geometry
    # below expect ints.
    video_width = int(video_width)
    video_height = int(video_height)
    text_size = int(text_size)

    font_path = os.path.abspath(text_font)

    # Capacity estimate: a glyph is assumed to be half the font size wide, and
    # each line is budgeted text_size + 15 pixels of height. Both values are
    # clamped to at least 1 so degenerate sizes cannot divide by zero or
    # produce zero-capacity pages.
    max_chars_per_line = max(1, video_width // max(1, text_size // 2))
    max_lines_per_page = max(1, video_height // (text_size + 15))
    pages = _paginate_text(text, max_chars_per_line, max_lines_per_page)

    # A private directory per call keeps concurrent requests from overwriting
    # each other's page images and output video.
    work_dir = tempfile.mkdtemp(prefix="tts_video_")
    final_video_path = os.path.join(work_dir, "output_video.mp4")
    audio_paths = []
    image_paths = []
    audio_clips = []
    video_clips = []
    succeeded = False
    try:
        for i, page in enumerate(pages):
            # Read each page as one continuous utterance: newlines are
            # replaced so the TTS engine does not pause at line breaks.
            audio_text = page.replace("\n", " ")
            audio, warning = asyncio.run(text_to_speech(audio_text, voice, rate, pitch))
            if warning or not audio:
                return None, warning or "Speech synthesis failed."
            audio_paths.append(audio)
            audio_clip = AudioFileClip(audio)
            audio_clips.append(audio_clip)

            img_path = os.path.join(work_dir, f"page_{i}.png")
            image_paths.append(img_path)
            _render_page_image(
                page, img_path, video_width, video_height,
                bg_color, text_color, font_path, text_size,
            )
            text_clip = ImageClip(img_path).set_duration(audio_clip.duration).set_audio(audio_clip)
            video_clips.append(text_clip)

        # Join all page clips into the final video.
        final_video = concatenate_videoclips(video_clips)
        final_video.write_videofile(final_video_path, fps=24, codec="libx264")
        succeeded = True
        return final_video_path, None
    finally:
        # Release the audio readers before deleting the files they read.
        for clip in audio_clips:
            clip.close()
        for path in audio_paths + image_paths:
            _remove_quietly(path)
        if not succeeded:
            _remove_quietly(final_video_path)
            try:
                os.rmdir(work_dir)
            except OSError:
                pass
# Gradio interface callback
def tts_interface(text, voice, rate, pitch, video_width, video_height, bg_color, text_color, text_font, text_size):
    """Adapt ``text_to_video`` to a three-value (audio, video, warning) result.

    The audio slot is always ``None``; the video and warning values are passed
    through from ``text_to_video`` unchanged.
    """
    outcome = text_to_video(
        text,
        voice,
        rate,
        pitch,
        video_width,
        video_height,
        bg_color,
        text_color,
        text_font,
        text_size,
    )
    video, warning = outcome
    audio = None
    return audio, video, warning
# Build the Gradio application
async def create_demo():
    """Create the Gradio interface for the text-to-speech/video tool.

    The voice list is fetched with ``get_voices()`` so the dropdown can be
    populated before the interface is constructed.

    Returns:
        The configured ``gr.Interface``; the caller is responsible for
        launching it.
    """
    voices = await get_voices()
    demo = gr.Interface(
        fn=tts_interface,
        inputs=[
            gr.Textbox(label="Input Text", lines=5),
            # The leading empty choice is the initial "no voice selected" state.
            gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=""),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Rate Adjustment (%)", step=1),
            gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1),
            gr.Slider(minimum=640, maximum=1920, value=1080, label="Video Width", step=10),
            gr.Slider(minimum=480, maximum=1080, value=720, label="Video Height", step=10),
            gr.ColorPicker(value="#000000", label="Background Color"),
            gr.ColorPicker(value="#FFFFFF", label="Text Color"),
            gr.Textbox(label="Text Font", value="msyh.ttf"),  # make sure the font file path is correct
            gr.Slider(minimum=10, maximum=100, value=24, label="Text Size", step=1),
        ],
        outputs=[
            gr.Audio(label="Generated Audio", type="filepath"),
            gr.Video(label="Generated Video"),
            gr.Markdown(label="Warning", visible=False),
        ],
        title="Edge TTS Text to Speech and Video",
        description="Convert text to speech and video using Microsoft Edge TTS. Adjust rate and pitch: 0 is the default value, positive values increase, and negative values decrease.",
        analytics_enabled=False,
        # Flagging is disabled via the documented string mode rather than a boolean.
        allow_flagging="never",
    )
    return demo
# Run the application
if __name__ == "__main__":
    # create_demo() is a coroutine, so it is driven to completion with
    # asyncio.run() before the resulting interface is launched.
    demo = asyncio.run(create_demo())
    demo.launch()
|