"""YouTube subtitle extractor.

A small Gradio app that pulls manual or auto-generated subtitles from a
YouTube video via yt-dlp, preferring Korean (manual > auto) and falling
back to English (manual > auto).
"""

import gradio as gr
import yt_dlp
import re
from datetime import timedelta
import os
import tempfile
import browser_cookie3


def get_cookies():
    """Export YouTube cookies from a local browser to a Netscape-format file.

    Tries Chrome first, then Firefox. Returns the path of the written
    cookie file, or None when no browser cookie store could be read.
    The caller is responsible for deleting the file when done.
    """
    try:
        cookies = browser_cookie3.chrome(domain_name='.youtube.com')
    except Exception:
        # browser_cookie3 raises different exception types per version/
        # browser; catch Exception (never bare except) so Ctrl-C still works.
        try:
            cookies = browser_cookie3.firefox(domain_name='.youtube.com')
        except Exception:
            return None

    # Unique temp file instead of a fixed /tmp/youtube.txt: a hardcoded
    # world-writable path can be clobbered by concurrent requests or
    # pre-created by another user on the machine.
    fd, cookie_path = tempfile.mkstemp(prefix='youtube_cookies_', suffix='.txt')
    with os.fdopen(fd, 'w') as f:
        # yt-dlp loads 'cookiefile' via http.cookiejar.MozillaCookieJar,
        # which refuses files lacking this magic header line.
        f.write("# Netscape HTTP Cookie File\n")
        for cookie in cookies:
            if cookie.domain == '.youtube.com':
                # Netscape fields: domain, include-subdomains flag, path,
                # secure flag, expiry, name, value. Session cookies have
                # expires=None; write 0 so the line stays parseable.
                f.write(f"{cookie.domain}\tTRUE\t{cookie.path}\t"
                        f"{'TRUE' if cookie.secure else 'FALSE'}\t{cookie.expires or 0}\t"
                        f"{cookie.name}\t{cookie.value}\n")
    return cookie_path


def _format_vtt_lines(src):
    """Parse inline WEBVTT-style text into '[MM:SS] caption' output lines.

    A line starting with an HH:MM:SS timestamp opens a new cue; subsequent
    non-empty, non-'WEBVTT' lines are its caption text. Returns a list of
    formatted strings (each ending in a newline).
    """
    out = []
    current_time = None
    current_text = []
    for line in src.split('\n'):
        if re.match(r'\d{2}:\d{2}:\d{2}', line):
            # Flush the previous cue before starting a new one.
            if current_time and current_text:
                out.append(f"[{current_time}] {''.join(current_text)}\n")
            time_parts = line.split(':')
            # Keep MM:SS of the cue start; [2].split('.')[0] strips the
            # millisecond suffix (and any trailing '--> ...' arrow text).
            current_time = f"{time_parts[1]}:{time_parts[2].split('.')[0]}"
            current_text = []
        elif line.strip() and not line.startswith('WEBVTT'):
            current_text.append(line.strip() + ' ')
    # Flush the final cue, which has no following timestamp to trigger it.
    if current_time and current_text:
        out.append(f"[{current_time}] {''.join(current_text)}\n")
    return out


def extract_transcript(url):
    """Extract a timestamped transcript from a YouTube video URL.

    Subtitle priority: manual Korean, auto Korean, manual English, auto
    English. Returns the formatted transcript (title + caption lines) or a
    Korean-language error message string; never raises.
    """
    cookie_path = None
    try:
        cookie_path = get_cookies()

        ydl_opts = {
            'writesubtitles': True,
            'writeautomaticsub': True,
            'subtitleslangs': ['ko', 'en'],
            'skip_download': True,  # metadata/subtitles only, no media
            'quiet': True,
        }
        if cookie_path:
            ydl_opts['cookiefile'] = cookie_path

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=False)

        subtitles = info.get('subtitles', {})
        automatic_captions = info.get('automatic_captions', {})

        # Accumulate pieces in a list and join once at the end; repeated
        # string += inside the parsing loops is quadratic.
        parts = [f"제목: {info.get('title', '제목 없음')}\n\n"]

        # Priority order: manual beats auto, Korean beats English.
        subtitle_priorities = [
            ('ko', subtitles),
            ('ko', automatic_captions),
            ('en', subtitles),
            ('en', automatic_captions),
        ]

        subtitle_found = False
        for lang, sub_dict in subtitle_priorities:
            subs = sub_dict.get(lang)
            if not (isinstance(subs, list) and subs):
                continue
            subtitle_found = True
            for entry in subs:
                # NOTE(review): this expects inline caption text under an
                # 'src' key; current yt-dlp subtitle entries usually carry
                # only 'url'/'ext' metadata -- confirm against the yt-dlp
                # version in use, otherwise captions are silently skipped.
                if 'src' in entry:
                    parts.extend(_format_vtt_lines(entry['src']))
            break  # stop at the first (highest-priority) language found

        if not subtitle_found:
            return "자막을 찾을 수 없습니다. (자동 생성 자막 포함)"

        return ''.join(parts)

    except Exception as e:
        error_msg = str(e)
        if "Sign in to confirm your age" in error_msg:
            return "연령 제한이 있는 영상입니다."
        elif "confirm you're not a bot" in error_msg:
            return "YouTube가 봇 방지를 위해 인증을 요구합니다. 잠시 후 다시 시도해주세요."
        return f"자막 추출 중 오류가 발생했습니다: {error_msg}"
    finally:
        # Remove the temp cookie file on every path; the original deleted
        # it only on success and leaked it whenever extraction raised.
        if cookie_path and os.path.exists(cookie_path):
            os.remove(cookie_path)


# Gradio UI: one URL textbox in, one transcript textbox out.
iface = gr.Interface(
    fn=extract_transcript,
    inputs=gr.Textbox(
        label="YouTube URL",
        placeholder="https://www.youtube.com/watch?v=..."
    ),
    outputs=gr.Textbox(
        label="추출된 스크립트",
        lines=20
    ),
    title="YouTube 자막 추출기",
    description=""" YouTube 영상의 URL을 입력하면 자막을 추출합니다. - 한국어 자막 우선 (수동 > 자동) - 영어 자막 차선 (수동 > 자동) """,
    allow_flagging="never"
)

if __name__ == "__main__":
    # Bind to all interfaces so the app is reachable from outside the host
    # (e.g. when running inside a container).
    iface.launch(server_name="0.0.0.0")