Spaces:

yoon2566
/

script

Running

App Files Files Community

yoon2566 committed 4 days ago

Commit

5a2df22

verified ·

1 Parent(s): 85830a8

Update app.py

Browse files

Files changed (1) hide show

app.py +90 -77

app.py CHANGED Viewed

@@ -2,14 +2,37 @@ import gradio as gr
 import yt_dlp
 import re
 from datetime import timedelta
-def format_timestamp(seconds):
-    """Convert seconds to MM:SS format"""
-    return str(timedelta(seconds=seconds)).split(':')[1:3]
 def extract_transcript(url):
-    """Extract transcript from YouTube video using yt-dlp"""
     try:
         ydl_opts = {
             'writesubtitles': True,
             'writeautomaticsub': True,
@@ -18,6 +41,10 @@ def extract_transcript(url):
             'quiet': True
         }
         with yt_dlp.YoutubeDL(ydl_opts) as ydl:
             # Get video info
             info = ydl.extract_info(url, download=False)
@@ -26,85 +53,67 @@ def extract_transcript(url):
             subtitles = info.get('subtitles', {})
             automatic_captions = info.get('automatic_captions', {})
-            # Try to get subtitles in preferred order
-            subtitle_text = None
-            # 1. Try manual Korean subtitles
-            if 'ko' in subtitles:
-                for fmt in ['vtt', 'srv1', 'srv2', 'srv3']:
-                    for sub in subtitles['ko']:
-                        if sub.get('ext') == fmt:
-                            subtitle_text = ydl.write_debug_json(sub['url'])
-                            break
-                    if subtitle_text:
-                        break
-            # 2. Try auto-generated Korean subtitles
-            if not subtitle_text and 'ko' in automatic_captions:
-                for fmt in ['vtt', 'srv1', 'srv2', 'srv3']:
-                    for sub in automatic_captions['ko']:
-                        if sub.get('ext') == fmt:
-                            subtitle_text = ydl.write_debug_json(sub['url'])
-                            break
-                    if subtitle_text:
-                        break
-            # 3. Try English subtitles
-            if not subtitle_text and 'en' in subtitles:
-                for fmt in ['vtt', 'srv1', 'srv2', 'srv3']:
-                    for sub in subtitles['en']:
-                        if sub.get('ext') == fmt:
-                            subtitle_text = ydl.write_debug_json(sub['url'])
-                            break
-                    if subtitle_text:
-                        break
-            # 4. Try auto-generated English subtitles
-            if not subtitle_text and 'en' in automatic_captions:
-                for fmt in ['vtt', 'srv1', 'srv2', 'srv3']:
-                    for sub in automatic_captions['en']:
-                        if sub.get('ext') == fmt:
-                            subtitle_text = ydl.write_debug_json(sub['url'])
-                            break
-                    if subtitle_text:
-                        break
-            if not subtitle_text:
-                return f"자막을 찾을 수 없습니다.\n제목: {info.get('title')}"
             # Format output
-            formatted_output = f"제목: {info.get('title')}\n\n"
-            # Parse WebVTT format
-            lines = subtitle_text.split('\n')
-            current_time = None
-            current_text = []
-            for line in lines:
-                # Time stamp line
-                if re.match(r'\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}', line):
-                    if current_time and current_text:
-                        mins, secs = current_time
-                        formatted_output += f"[{mins}:{secs:02d}] {''.join(current_text)}\n"
-                    start_time = line.split(' --> ')[0]
-                    time_parts = start_time.split(':')
-                    seconds = int(time_parts[1]) * 60 + float(time_parts[2].split('.')[0])
-                    current_time = divmod(int(seconds), 60)
-                    current_text = []
-                # Text line
-                elif line.strip() and not line.startswith('WEBVTT'):
-                    current_text.append(line.strip() + ' ')
-            # Add last subtitle
-            if current_time and current_text:
-                mins, secs = current_time
-                formatted_output += f"[{mins}:{secs:02d}] {''.join(current_text)}\n"
             return formatted_output
     except Exception as e:
-        return f"자막 추출 중 오류가 발생했습니다: {str(e)}"
 # Create Gradio interface
 iface = gr.Interface(
@@ -117,8 +126,12 @@ iface = gr.Interface(
         label="추출된 스크립트",
         lines=20
     ),
-    title="YouTube 자막 추출기 (yt-dlp 버전)",
-    description="YouTube 영상의 URL을 입력하면 자막을 추출합니다. (한국어 우선, 영어 차선)",
     allow_flagging="never"
 )

 import yt_dlp
 import re
 from datetime import timedelta
+import os
+import browser_cookie3
+def get_cookies():
+    """Export the local browser's YouTube cookies to a Netscape cookie file.
+
+    Chrome's cookie store is tried first, then Firefox's. Only cookies whose
+    domain is exactly '.youtube.com' are written.
+
+    Returns:
+        The path of the written cookie file ('/tmp/youtube.txt'), or None
+        when neither browser's cookies could be loaded.
+    """
+    cookies = None
+    for load_cookies in (browser_cookie3.chrome, browser_cookie3.firefox):
+        try:
+            cookies = load_cookies(domain_name='.youtube.com')
+            break
+        except Exception:
+            # Best effort: a missing or unreadable browser profile just means
+            # we fall through to the next browser. Catching Exception (not a
+            # bare except) lets KeyboardInterrupt/SystemExit propagate.
+            continue
+    if cookies is None:
+        return None
+    # Convert cookies to Netscape format
+    cookie_path = '/tmp/youtube.txt'
+    with open(cookie_path, 'w', encoding='utf-8') as f:
+        # Netscape-format loaders built on http.cookiejar check for this
+        # magic first line before parsing any entries.
+        f.write("# Netscape HTTP Cookie File\n")
+        for cookie in cookies:
+            if cookie.domain == '.youtube.com':
+                # Session cookies have expires=None; the expiry column must
+                # be numeric, and 0 is the conventional session marker.
+                expires = cookie.expires if cookie.expires is not None else 0
+                f.write(f"{cookie.domain}\tTRUE\t{cookie.path}\t"
+                       f"{'TRUE' if cookie.secure else 'FALSE'}\t{expires}\t"
+                       f"{cookie.name}\t{cookie.value}\n")
+    return cookie_path
 def extract_transcript(url):
+    """Extract transcript from YouTube video using yt-dlp with cookies"""
     try:
+        # Get cookies
+        cookie_path = get_cookies()
         ydl_opts = {
             'writesubtitles': True,
             'writeautomaticsub': True,
             'quiet': True
         }
+        # Add cookies if available
+        if cookie_path:
+            ydl_opts['cookiefile'] = cookie_path
         with yt_dlp.YoutubeDL(ydl_opts) as ydl:
             # Get video info
             info = ydl.extract_info(url, download=False)
             subtitles = info.get('subtitles', {})
             automatic_captions = info.get('automatic_captions', {})
             # Format output
+            formatted_output = f"제목: {info.get('title', '제목 없음')}\n\n"
+            # Process subtitles
+            subtitle_found = False
+            # Priority order for subtitles
+            subtitle_priorities = [
+                ('ko', subtitles),           # Manual Korean
+                ('ko', automatic_captions),  # Auto Korean
+                ('en', subtitles),           # Manual English
+                ('en', automatic_captions)   # Auto English
+            ]
+            for lang, sub_dict in subtitle_priorities:
+                if lang in sub_dict and not subtitle_found:
+                    subs = sub_dict[lang]
+                    if isinstance(subs, list) and subs:
+                        subtitle_found = True
+                        # Process each subtitle entry
+                        for entry in subs:
+                            if 'src' in entry:  # JSON format
+                                lines = entry['src'].split('\n')
+                                current_time = None
+                                current_text = []
+                                for line in lines:
+                                    # Time stamp line
+                                    if re.match(r'\d{2}:\d{2}:\d{2}', line):
+                                        if current_time and current_text:
+                                            formatted_output += f"[{current_time}] {''.join(current_text)}\n"
+                                        time_parts = line.split(':')
+                                        current_time = f"{time_parts[1]}:{time_parts[2].split('.')[0]}"
+                                        current_text = []
+                                    # Text line
+                                    elif line.strip() and not line.startswith('WEBVTT'):
+                                        current_text.append(line.strip() + ' ')
+                                # Add last subtitle
+                                if current_time and current_text:
+                                    formatted_output += f"[{current_time}] {''.join(current_text)}\n"
+                                break
+            if not subtitle_found:
+                return "자막을 찾을 수 없습니다. (자동 생성 자막 포함)"
+            # Clean up cookie file
+            if cookie_path and os.path.exists(cookie_path):
+                os.remove(cookie_path)
             return formatted_output
     except Exception as e:
+        error_msg = str(e)
+        if "Sign in to confirm your age" in error_msg:
+            return "연령 제한이 있는 영상입니다."
+        elif "confirm you're not a bot" in error_msg:
+            return "YouTube가 봇 방지를 위해 인증을 요구합니다. 잠시 후 다시 시도해주세요."
+        return f"자막 추출 중 오류가 발생했습니다: {error_msg}"
 # Create Gradio interface
 iface = gr.Interface(
         label="추출된 스크립트",
         lines=20
     ),
+    title="YouTube 자막 추출기",
+    description="""
+    YouTube 영상의 URL을 입력하면 자막을 추출합니다.
+    - 한국어 자막 우선 (수동 > 자동)
+    - 영어 자막 차선 (수동 > 자동)
+    """,
     allow_flagging="never"
 )