# app.py
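"""Gradio app that extracts subtitles from a YouTube video using yt-dlp.

Korean subtitles are preferred over English, and manual tracks over
auto-generated ones. Browser cookies (Chrome, then Firefox) are reused to
reduce YouTube's bot-verification prompts.
"""
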
import os
import re
import urllib.request

import browser_cookie3
import gradio as gr
import yt_dlp


def get_cookies():
    """Get YouTube cookies from a local browser and write them to a
    Netscape-format cookie file that yt-dlp can read."""
    try:
        # Try Chrome cookies first
        cookies = browser_cookie3.chrome(domain_name='.youtube.com')
    except Exception:
        try:
            # Fall back to Firefox cookies if Chrome fails
            cookies = browser_cookie3.firefox(domain_name='.youtube.com')
        except Exception:
            return None

    # Convert cookies to Netscape format
    cookie_path = '/tmp/youtube.txt'
    with open(cookie_path, 'w') as f:
        # Header line expected by Netscape/Mozilla cookie-file parsers
        f.write("# Netscape HTTP Cookie File\n")
        for cookie in cookies:
            if cookie.domain == '.youtube.com':
                # Tab-separated fields: domain, include-subdomains flag, path,
                # secure flag, expiry (epoch seconds, 0 for session cookies), name, value
                f.write(f"{cookie.domain}\tTRUE\t{cookie.path}\t"
                        f"{'TRUE' if cookie.secure else 'FALSE'}\t{cookie.expires or 0}\t"
                        f"{cookie.name}\t{cookie.value}\n")
    return cookie_path
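
# Optional sanity check (illustrative sketch, not part of the app's flow): the
# exported file should load as a Netscape/Mozilla cookie jar, e.g.
#
#   import http.cookiejar
#   jar = http.cookiejar.MozillaCookieJar('/tmp/youtube.txt')
#   jar.load(ignore_discard=True, ignore_expires=True)
#   print(len(jar), "YouTube cookies exported")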


def extract_transcript(url):
    """Extract transcript from YouTube video using yt-dlp with cookies"""
    try:
        # Get cookies
        cookie_path = get_cookies()

        ydl_opts = {
            'writesubtitles': True,          # request manual subtitles
            'writeautomaticsub': True,       # request auto-generated captions
            'subtitleslangs': ['ko', 'en'],
            'skip_download': True,           # metadata only, no media download
            'quiet': True
        }

        # Add cookies if available
        if cookie_path:
            ydl_opts['cookiefile'] = cookie_path
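
        # Alternative (untested sketch): yt-dlp can also read browser cookies
        # directly, without the manual export in get_cookies(), e.g.
        #
        #   ydl_opts['cookiesfrombrowser'] = ('chrome',)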

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            # Get video info
            info = ydl.extract_info(url, download=False)

            # Available subtitle tracks, keyed by language code
            subtitles = info.get('subtitles', {})
            automatic_captions = info.get('automatic_captions', {})

            # Format output ("제λͺ©:" = "Title:", "제λͺ© μ—†μŒ" = "no title")
            formatted_output = f"제λͺ©: {info.get('title', '제λͺ© μ—†μŒ')}\n\n"

            # Process subtitles
            subtitle_found = False

            # Priority order for subtitles
            subtitle_priorities = [
                ('ko', subtitles),            # Manual Korean
                ('ko', automatic_captions),   # Auto Korean
                ('en', subtitles),            # Manual English
                ('en', automatic_captions)    # Auto English
            ]

            for lang, sub_dict in subtitle_priorities:
                if lang in sub_dict and not subtitle_found:
                    subs = sub_dict[lang]
                    if isinstance(subs, list) and subs:
                        # yt-dlp lists each track as a dict with 'ext' and 'url'
                        # (and sometimes inline 'data'); prefer a WebVTT track.
                        entry = next((s for s in subs if s.get('ext') == 'vtt'), subs[0])
                        vtt_text = entry.get('data')
                        if not vtt_text and entry.get('url'):
                            # extract_info only returns track metadata, so the
                            # caption text itself has to be fetched separately.
                            with urllib.request.urlopen(entry['url']) as resp:
                                vtt_text = resp.read().decode('utf-8', errors='replace')
                        if not vtt_text:
                            continue
                        subtitle_found = True

                        current_time = None
                        current_text = []
                        for line in vtt_text.split('\n'):
                            # Time stamp line, e.g. "00:01:02.000 --> 00:01:04.500"
                            if re.match(r'\d{2}:\d{2}:\d{2}', line):
                                if current_time and current_text:
                                    formatted_output += f"[{current_time}] {''.join(current_text)}\n"
                                time_parts = line.split(':')
                                # Keep MM:SS of the cue start time
                                current_time = f"{time_parts[1]}:{time_parts[2].split('.')[0]}"
                                current_text = []
                            # Text line (drop WebVTT markup such as <c> tags)
                            elif line.strip() and not line.startswith('WEBVTT'):
                                current_text.append(re.sub(r'<[^>]+>', '', line).strip() + ' ')

                        # Add last subtitle
                        if current_time and current_text:
                            formatted_output += f"[{current_time}] {''.join(current_text)}\n"
                        break

        if not subtitle_found:
            # "No subtitles were found (including auto-generated ones)."
            return "μžλ§‰μ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€. (μžλ™ 생성 μžλ§‰ 포함)"

        # Clean up cookie file
        if cookie_path and os.path.exists(cookie_path):
            os.remove(cookie_path)

        return formatted_output

    except Exception as e:
        error_msg = str(e)
        if "Sign in to confirm your age" in error_msg:
            # "This video is age-restricted."
            return "μ—°λ Ή μ œν•œμ΄ μžˆλŠ” μ˜μƒμž…λ‹ˆλ‹€."
        elif "confirm you're not a bot" in error_msg:
            # "YouTube is requiring verification to block bots. Please try again later."
            return "YouTubeκ°€ 봇 방지λ₯Ό μœ„ν•΄ 인증을 μš”κ΅¬ν•©λ‹ˆλ‹€. μž μ‹œ ν›„ λ‹€μ‹œ μ‹œλ„ν•΄μ£Όμ„Έμš”."
        # "An error occurred while extracting the subtitles: ..."
        return f"μžλ§‰ μΆ”μΆœ 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {error_msg}"


# Create Gradio interface
iface = gr.Interface(
    fn=extract_transcript,
    inputs=gr.Textbox(
        label="YouTube URL",
        placeholder="https://www.youtube.com/watch?v=..."
    ),
    outputs=gr.Textbox(
        label="μΆ”μΆœλœ 슀크립트",  # "Extracted transcript"
        lines=20
    ),
    title="YouTube μžλ§‰ μΆ”μΆœκΈ°",  # "YouTube subtitle extractor"
    # Description (Korean): "Enter a YouTube video URL to extract its subtitles.
    # Korean subtitles first (manual > auto), English subtitles second (manual > auto)."
    description="""
    YouTube μ˜μƒμ˜ URL을 μž…λ ₯ν•˜λ©΄ μžλ§‰μ„ μΆ”μΆœν•©λ‹ˆλ‹€.
    - ν•œκ΅­μ–΄ μžλ§‰ μš°μ„  (μˆ˜λ™ > μžλ™)
    - μ˜μ–΄ μžλ§‰ μ°¨μ„  (μˆ˜λ™ > μžλ™)
    """,
    allow_flagging="never"
)

# Launch the app
if __name__ == "__main__":
    iface.launch(server_name="0.0.0.0")
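
# Note: binding to 0.0.0.0 makes the app reachable from outside the container,
# as hosted environments such as Hugging Face Spaces expect; for a temporary
# public link when running locally, Gradio also supports iface.launch(share=True).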