Spaces:

hivecorp
/

master-tts-pro

Sleeping

App Files Files Community

hivecorp committed on Nov 3, 2024

Commit

077e0e7

verified ·

1 Parent(s): 85eaa57

Update app.py

Browse files

Files changed (1) hide show

app.py +101 -85

app.py CHANGED Viewed

@@ -1,92 +1,108 @@
-import gradio as gr
-from pydub import AudioSegment
 import edge_tts
 import os
-import asyncio
-# Function to get the length of an audio file in seconds
-def get_audio_length(audio_file):
-    audio = AudioSegment.from_file(audio_file)
-    return audio.duration_seconds
-# Function to format time for SRT
-def format_time(seconds):
-    millis = int((seconds % 1) * 1000)
-    seconds = int(seconds)
-    hrs = seconds // 3600
-    mins = (seconds % 3600) // 60
-    secs = seconds % 60
-    return f"{hrs:02}:{mins:02}:{secs:02},{millis:03}"
-# Function to generate SRT with accurate timing per batch
-async def generate_accurate_srt(batch_text, batch_num, start_offset):
-    audio_file = f"batch_{batch_num}_audio.wav"
-    # Generate the audio using edge-tts
-    tts = edge_tts.Communicate(batch_text, "en-US-AndrewNeural", rate="-25%")
-    await tts.save(audio_file)
-    # Get the actual length of the audio file
-    actual_length = get_audio_length(audio_file)
-    # Initialize SRT content
-    srt_content = ""
-    words = batch_text.split()
-    segment_duration = actual_length / len(words) * 10  # Adjusted for ~10 words per SRT segment
-    start_time = start_offset
-    # Build SRT content with accurate timing
-    for i in range(0, len(words), 10):
-        segment_words = words[i:i+10]
-        end_time = start_time + segment_duration
-        srt_content += f"{i // 10 + 1 + (batch_num * 100)}\n"
-        srt_content += f"{format_time(start_time)} --> {format_time(end_time)}\n"
-        srt_content += " ".join(segment_words) + "\n\n"
-        start_time = end_time
-    return srt_content, audio_file, start_time
-# Batch processing function for SRT and audio generation
-async def batch_process_srt_and_audio(script_text):
-    batches = [script_text[i:i+500] for i in range(0, len(script_text), 500)]
-    all_srt_content = ""
-    combined_audio = AudioSegment.empty()
-    start_offset = 0.0  # Track cumulative time offset for SRT timing
-    for batch_num, batch_text in enumerate(batches):
-        srt_content, audio_file, end_offset = await generate_accurate_srt(batch_text, batch_num, start_offset)
-        all_srt_content += srt_content
-        # Append the audio of each batch to the combined audio
-        batch_audio = AudioSegment.from_file(audio_file)
-        combined_audio += batch_audio
-        start_offset = end_offset  # Update the start offset for the next batch
-        # Clean up the individual batch audio file
-        os.remove(audio_file)
-    # Export combined audio and SRT
-    combined_audio.export("final_audio.wav", format="wav")
-    with open("final_subtitles.srt", "w") as srt_file:
-        srt_file.write(all_srt_content)
-    return "final_subtitles.srt", "final_audio.wav"
-# Gradio interface function
-async def process_script(script_text):
-    srt_path, audio_path = await batch_process_srt_and_audio(script_text)
-    return srt_path, audio_path, audio_path
-# Gradio interface setup
-app = gr.Interface(
-    fn=process_script,
-    inputs=gr.Textbox(label="Enter Script Text", lines=10),
-    outputs=[
-        gr.File(label="Download SRT File"),
-        gr.File(label="Download Audio File"),
-        gr.Audio(label="Play Audio")
-    ],
-    description="Upload your script text, and the app will generate audio with en-US-AndrewNeural voice (Rate: -25%) and an accurate SRT file for download."
-)
-app.launch()

 import edge_tts
+import srt
 import os
+import wave
+import gradio as gr
+# Function to calculate audio duration for a given audio file
+def get_audio_length(audio_path):
+    with wave.open(audio_path, 'rb') as audio:
+        frames = audio.getnframes()
+        rate = audio.getframerate()
+        return frames / float(rate)
+# Function to generate SRT entries for a batch of text with accurate timing
+def generate_accurate_srt(text, start_time, batch_index):
+    srt_entries = []
+    current_time = start_time
+    for line in text.splitlines():
+        # Estimate duration of each line based on audio segment generated
+        duration = len(line.split()) * 0.3  # Assuming approx. 0.3 seconds per word
+        end_time = current_time + duration
+        srt_entries.append(
+            srt.Subtitle(
+                index=batch_index,
+                start=srt.timedelta(seconds=current_time),
+                end=srt.timedelta(seconds=end_time),
+                content=line
+            )
+        )
+        current_time = end_time
+        batch_index += 1
+    return srt_entries, current_time
+# Process each batch of text, generate audio, and accumulate SRT entries
+def batch_process_srt_and_audio(script_text, batch_size=500):
+    total_srt_entries = []
+    cumulative_time = 0.0
+    batch_index = 1
+    for i in range(0, len(script_text), batch_size):
+        batch_text = script_text[i:i+batch_size]
+        # Generate audio for the batch
+        audio_file = f"audio_batch_{i}.wav"
+        communicate = edge_tts.Communicate(text=batch_text, voice="en-US-AndrewNeural", rate="-25%")
+        communicate.save(audio_file)
+        # Get the duration of the generated audio batch
+        batch_duration = get_audio_length(audio_file)
+        # Generate SRT entries for this batch and update cumulative time
+        srt_entries, cumulative_time = generate_accurate_srt(batch_text, cumulative_time, batch_index)
+        total_srt_entries.extend(srt_entries)
+        batch_index += len(srt_entries)
+    # Write the SRT file
+    srt_file = "output.srt"
+    with open(srt_file, 'w') as file:
+        file.write(srt.compose(total_srt_entries))
+    return srt_file
+# Final validation to ensure no SRT entry extends beyond total audio duration
+def validate_srt_against_audio(srt_file_path, audio_file_path):
+    audio_duration = get_audio_length(audio_file_path)
+    with open(srt_file_path, 'r') as file:
+        subtitles = list(srt.parse(file.read()))
+    for subtitle in subtitles:
+        if subtitle.end.total_seconds() > audio_duration:
+            subtitle.end = srt.timedelta(seconds=audio_duration)
+            break
+    # Write the validated SRT back to the file
+    with open(srt_file_path, 'w') as file:
+        file.write(srt.compose(subtitles))
+    return srt_file_path
+# Gradio Interface
+def process_text_to_srt(script_text):
+    # Process the script in batches and create SRT
+    srt_file = batch_process_srt_and_audio(script_text)
+    # Validate the final SRT file with the complete audio file
+    final_audio_file = "combined_audio.wav"  # Assumes you have a combined final audio file
+    validate_srt_against_audio(srt_file, final_audio_file)
+    return srt_file, final_audio_file
+# Gradio app setup
+def main():
+    gr.Interface(
+        fn=process_text_to_srt,
+        inputs="textbox",
+        outputs=["file", "audio"],
+        live=True,
+        title="Text-to-SRT with Accurate Timing",
+        description="Enter text to convert it into audio with synchronized SRT subtitles. The SRT timings are validated against the total audio duration."
+    ).launch()
+# Launch the app only when this file is executed as a script, not when imported
+if __name__ == "__main__":
+    main()