Spaces:

hivecorp
/

master-tts-pro

Sleeping

App Files Files Community

hivecorp committed on Nov 13, 2024

Commit

f5e4024

verified ·

1 Parent(s): f826da3

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -71

app.py CHANGED Viewed

@@ -6,37 +6,30 @@ import asyncio
 import uuid
 import re
-# Function to get the length of an audio file in seconds
 def get_audio_length(audio_file):
     audio = AudioSegment.from_file(audio_file)
-    return audio.duration_seconds
-# Function to format time for SRT
-def format_time(seconds):
-    millis = int((seconds % 1) * 1000)
-    seconds = int(seconds)
-    hrs = seconds // 3600
-    mins = (seconds % 3600) // 60
-    secs = seconds % 60
-    return f"{hrs:02}:{mins:02}:{secs:02},{millis:03}"
-# Function to split text based on punctuation, handling segments over 8 words
 def split_text_into_segments(text):
-    # Split based on punctuation marks (.!? and ,)
     segments = []
     raw_segments = re.split(r'([.!?,])', text)
-    temp_sentence = ""
     for i in range(0, len(raw_segments) - 1, 2):
-        # Combine sentence with punctuation
         sentence = raw_segments[i].strip() + raw_segments[i + 1]
         words = sentence.split()
-        # If the sentence has 8 words or fewer, add as is
         if len(words) <= 8:
             segments.append(sentence.strip())
         else:
-            # Split longer sentences into chunks of max 8 words without splitting words
             chunk = ""
             for word in words:
                 if len(chunk.split()) < 8:
@@ -47,7 +40,6 @@ def split_text_into_segments(text):
             if chunk:
                 segments.append(chunk.strip())
-    # Handle any leftover sentence fragment not followed by punctuation
     if len(raw_segments) % 2 == 1:
         remaining_text = raw_segments[-1].strip()
         if remaining_text:
@@ -55,23 +47,19 @@ def split_text_into_segments(text):
     return segments
-# Function to generate SRT with accurate timing per batch
 async def generate_accurate_srt(batch_text, batch_num, start_offset, pitch, rate, voice):
     audio_file = f"batch_{batch_num}_audio.wav"
-    # Generate the audio using edge-tts
     tts = edge_tts.Communicate(batch_text, voice, rate=rate, pitch=pitch)
     await tts.save(audio_file)
-    # Get the actual length of the audio file
-    actual_length = get_audio_length(audio_file)
-    # Split the text into segments based on punctuation and word count
     segments = split_text_into_segments(batch_text)
-    segment_duration = actual_length / len(segments)  # Duration per segment
     start_time = start_offset
-    # Initialize SRT content
     srt_content = ""
     for index, segment in enumerate(segments):
         end_time = start_time + segment_duration
@@ -80,14 +68,14 @@ async def generate_accurate_srt(batch_text, batch_num, start_offset, pitch, rate
             end_time = start_offset + actual_length
         srt_content += f"{index + 1 + (batch_num * 100)}\n"
-        srt_content += f"{format_time(start_time)} --> {format_time(end_time)}\n"
         srt_content += segment + "\n\n"
         start_time = end_time
     return srt_content, audio_file, start_time
-# Batch processing function
 async def batch_process_srt_and_audio(script_text, pitch, rate, voice, progress=gr.Progress()):
     batches = [script_text[i:i + 500] for i in range(0, len(script_text), 500)]
     all_srt_content = ""
@@ -114,7 +102,7 @@ async def batch_process_srt_and_audio(script_text, pitch, rate, voice, progress=
             end_time = sum(x * float(t) for x, t in zip([3600, 60, 1, 0.001], end_str.replace(',', ':').split(':')))
             if end_time > total_audio_length:
                 end_time = total_audio_length
-            line = f"{format_time(start_time)} --> {format_time(end_time)}"
         validated_srt_content += line + "\n"
     unique_id = uuid.uuid4()
@@ -130,7 +118,6 @@ async def batch_process_srt_and_audio(script_text, pitch, rate, voice, progress=
 # Gradio interface function
 async def process_script(script_text, pitch, rate, voice):
-    # Format pitch correctly for edge-tts
     pitch_str = f"{pitch}Hz" if pitch != 0 else "-1Hz"
     formatted_rate = f"{'+' if rate > 1 else ''}{int(rate)}%"
     srt_path, audio_path = await batch_process_srt_and_audio(script_text, pitch_str, formatted_rate, voice_options[voice])
@@ -138,44 +125,8 @@ async def process_script(script_text, pitch, rate, voice):
 # Gradio interface setup
 voice_options = {
-   "Andrew Male": "en-US-AndrewNeural",
     "Jenny Female": "en-US-JennyNeural",
-    "Guy Male": "en-US-GuyNeural",
-    "Ana Female": "en-US-AnaNeural",
-    "Aria Female": "en-US-AriaNeural",
-    "Brian Male": "en-US-BrianNeural",
-    "Christopher Male": "en-US-ChristopherNeural",
-    "Eric Male": "en-US-EricNeural",
-    "Michelle Male": "en-US-MichelleNeural",
-    "Roger Male": "en-US-RogerNeural",
-    "Natasha Female": "en-AU-NatashaNeural",
-    "William Male": "en-AU-WilliamNeural",
-    "Clara Female": "en-CA-ClaraNeural",
-    "Liam Female ": "en-CA-LiamNeural",
-    "Libby Female": "en-GB-LibbyNeural",
-    "Maisie": "en-GB-MaisieNeural",
-    "Ryan": "en-GB-RyanNeural",
-    "Sonia": "en-GB-SoniaNeural",
-    "Thomas": "en-GB-ThomasNeural",
-    "Sam": "en-HK-SamNeural",
-    "Yan": "en-HK-YanNeural",
-    "Connor": "en-IE-ConnorNeural",
-    "Emily": "en-IE-EmilyNeural",
-    "Neerja": "en-IN-NeerjaNeural",
-    "Prabhat": "en-IN-PrabhatNeural",
-    "Asilia": "en-KE-AsiliaNeural",
-    "Chilemba": "en-KE-ChilembaNeural",
-    "Abeo": "en-NG-AbeoNeural",
-    "Ezinne": "en-NG-EzinneNeural",
-    "Mitchell": "en-NZ-MitchellNeural",
-    "James": "en-PH-JamesNeural",
-    "Rosa": "en-PH-RosaNeural",
-    "Luna": "en-SG-LunaNeural",
-    "Wayne": "en-SG-WayneNeural",
-    "Elimu": "en-TZ-ElimuNeural",
-    "Imani": "en-TZ-ImaniNeural",
-    "Leah": "en-ZA-LeahNeural",
-    "Luke": "en-ZA-LukeNeural"
     # Add other voices here...
 }
@@ -192,8 +143,8 @@ app = gr.Interface(
         gr.File(label="Download Audio File"),
         gr.Audio(label="Audio Playback")
     ],
-    title="HIVEcorp Text-to-Speech with SRT Generation",
-    description="Convert your script into audio and generate subtitles.",
     theme="compact",
 )

 import uuid
 import re
+# Function to get the length of an audio file in seconds
+# (len(audio) is divided by 1000 — presumably a millisecond count; confirm against the AudioSegment import)
 def get_audio_length(audio_file):
     audio = AudioSegment.from_file(audio_file)
+    return len(audio) / 1000  # Return in seconds for compatibility
+# Function to format a millisecond count as an SRT timestamp (HH:MM:SS,mmm)
+def format_time_ms(milliseconds):
+    """Format a duration in milliseconds as an SRT timestamp.
+
+    Args:
+        milliseconds: Duration as an int or float number of milliseconds.
+
+    Returns:
+        A string of the form ``HH:MM:SS,mmm``.
+    """
+    # Round instead of truncating: callers pass floats such as
+    # ``seconds * 1000``, and 1.001 * 1000 evaluates to 1000.9999999999999,
+    # which int() would cut down to 1000 and shift the cue by a millisecond.
+    # Negative values are clamped to zero because divmod on a negative
+    # number would produce a malformed timestamp.
+    total_ms = max(0, round(milliseconds))
+    seconds, ms = divmod(total_ms, 1000)
+    mins, secs = divmod(seconds, 60)
+    hrs, mins = divmod(mins, 60)
+    return f"{hrs:02}:{mins:02}:{secs:02},{ms:03}"
+# Function to split text into segments based on punctuation, ensuring no word is split
 def split_text_into_segments(text):
     segments = []
     raw_segments = re.split(r'([.!?,])', text)
     for i in range(0, len(raw_segments) - 1, 2):
         sentence = raw_segments[i].strip() + raw_segments[i + 1]
         words = sentence.split()
         if len(words) <= 8:
             segments.append(sentence.strip())
         else:
             chunk = ""
             for word in words:
                 if len(chunk.split()) < 8:
             if chunk:
                 segments.append(chunk.strip())
     if len(raw_segments) % 2 == 1:
         remaining_text = raw_segments[-1].strip()
         if remaining_text:
     return segments
+# Function to generate SRT with millisecond accuracy per batch
 async def generate_accurate_srt(batch_text, batch_num, start_offset, pitch, rate, voice):
     audio_file = f"batch_{batch_num}_audio.wav"
     tts = edge_tts.Communicate(batch_text, voice, rate=rate, pitch=pitch)
     await tts.save(audio_file)
+    actual_length = get_audio_length(audio_file) * 1000  # Convert to milliseconds
     segments = split_text_into_segments(batch_text)
+    segment_duration = actual_length / len(segments)
     start_time = start_offset
     srt_content = ""
     for index, segment in enumerate(segments):
         end_time = start_time + segment_duration
             end_time = start_offset + actual_length
         srt_content += f"{index + 1 + (batch_num * 100)}\n"
+        srt_content += f"{format_time_ms(start_time)} --> {format_time_ms(end_time)}\n"
         srt_content += segment + "\n\n"
         start_time = end_time
     return srt_content, audio_file, start_time
+# Batch processing function with millisecond accuracy
 async def batch_process_srt_and_audio(script_text, pitch, rate, voice, progress=gr.Progress()):
     batches = [script_text[i:i + 500] for i in range(0, len(script_text), 500)]
     all_srt_content = ""
             end_time = sum(x * float(t) for x, t in zip([3600, 60, 1, 0.001], end_str.replace(',', ':').split(':')))
             if end_time > total_audio_length:
                 end_time = total_audio_length
+            line = f"{format_time_ms(start_time * 1000)} --> {format_time_ms(end_time * 1000)}"
         validated_srt_content += line + "\n"
     unique_id = uuid.uuid4()
 # Gradio interface function
 async def process_script(script_text, pitch, rate, voice):
     pitch_str = f"{pitch}Hz" if pitch != 0 else "-1Hz"
     formatted_rate = f"{'+' if rate > 1 else ''}{int(rate)}%"
     srt_path, audio_path = await batch_process_srt_and_audio(script_text, pitch_str, formatted_rate, voice_options[voice])
 # Gradio interface setup
+# Maps a display label to a voice identifier; process_script looks the label up here
+# and passes the result to batch_process_srt_and_audio.
 voice_options = {
+    "Andrew Male": "en-US-AndrewNeural",
     "Jenny Female": "en-US-JennyNeural",
     # Add other voices here...
 }
         gr.File(label="Download Audio File"),
         gr.Audio(label="Audio Playback")
     ],
+    title="HIVEcorp Text-to-Speech with Millisecond SRT Generation",
+    description="Convert your script into audio and generate millisecond-accurate subtitles.",
     theme="compact",
 )