|
import gradio as gr |
|
import yt_dlp |
|
import re |
|
from datetime import timedelta |
|
import os |
|
import browser_cookie3 |
|
|
|
def get_cookies():
    """Export the browser's YouTube cookies to a Netscape-format cookie file.

    Chrome is tried first and Firefox second.

    Returns:
        The path of a freshly created temporary cookie file, or ``None``
        when neither browser's cookie store could be read. The caller owns
        the file and is responsible for deleting it.
    """
    import tempfile

    cookies = None
    for load_cookies in (browser_cookie3.chrome, browser_cookie3.firefox):
        try:
            cookies = load_cookies(domain_name='.youtube.com')
            break
        except Exception:
            # Best effort: a missing or locked browser profile just means
            # the next browser is tried.
            continue
    if cookies is None:
        return None

    # A unique, owner-only temp file: the cookies are credentials, and a
    # fixed shared path would let concurrent requests clobber each other.
    fd, cookie_path = tempfile.mkstemp(prefix='youtube_cookies_', suffix='.txt')
    try:
        with os.fdopen(fd, 'w', encoding='utf-8') as cookie_file:
            # cookies.txt loaders expect this magic first line.
            cookie_file.write('# Netscape HTTP Cookie File\n')
            for cookie in cookies:
                if cookie.domain != '.youtube.com':
                    continue
                # The expiry column must be an integer; session cookies
                # (no expiry) are conventionally written as 0.
                expires = int(cookie.expires) if cookie.expires else 0
                fields = (
                    cookie.domain,
                    'TRUE',
                    cookie.path,
                    'TRUE' if cookie.secure else 'FALSE',
                    str(expires),
                    cookie.name,
                    cookie.value,
                )
                cookie_file.write('\t'.join(fields) + '\n')
    except Exception:
        # Do not leave a partially written credentials file behind.
        os.remove(cookie_path)
        raise
    return cookie_path
|
|
|
def extract_transcript(url): |
|
"""Extract transcript from YouTube video using yt-dlp with cookies""" |
|
try: |
|
|
|
cookie_path = get_cookies() |
|
|
|
ydl_opts = { |
|
'writesubtitles': True, |
|
'writeautomaticsub': True, |
|
'subtitleslangs': ['ko', 'en'], |
|
'skip_download': True, |
|
'quiet': True |
|
} |
|
|
|
|
|
if cookie_path: |
|
ydl_opts['cookiefile'] = cookie_path |
|
|
|
with yt_dlp.YoutubeDL(ydl_opts) as ydl: |
|
|
|
info = ydl.extract_info(url, download=False) |
|
|
|
|
|
subtitles = info.get('subtitles', {}) |
|
automatic_captions = info.get('automatic_captions', {}) |
|
|
|
|
|
formatted_output = f"μ λͺ©: {info.get('title', 'μ λͺ© μμ')}\n\n" |
|
|
|
|
|
subtitle_found = False |
|
|
|
|
|
subtitle_priorities = [ |
|
('ko', subtitles), |
|
('ko', automatic_captions), |
|
('en', subtitles), |
|
('en', automatic_captions) |
|
] |
|
|
|
for lang, sub_dict in subtitle_priorities: |
|
if lang in sub_dict and not subtitle_found: |
|
subs = sub_dict[lang] |
|
if isinstance(subs, list) and subs: |
|
subtitle_found = True |
|
|
|
|
|
for entry in subs: |
|
if 'src' in entry: |
|
lines = entry['src'].split('\n') |
|
current_time = None |
|
current_text = [] |
|
|
|
for line in lines: |
|
|
|
if re.match(r'\d{2}:\d{2}:\d{2}', line): |
|
if current_time and current_text: |
|
formatted_output += f"[{current_time}] {''.join(current_text)}\n" |
|
|
|
time_parts = line.split(':') |
|
current_time = f"{time_parts[1]}:{time_parts[2].split('.')[0]}" |
|
current_text = [] |
|
|
|
elif line.strip() and not line.startswith('WEBVTT'): |
|
current_text.append(line.strip() + ' ') |
|
|
|
|
|
if current_time and current_text: |
|
formatted_output += f"[{current_time}] {''.join(current_text)}\n" |
|
break |
|
|
|
if not subtitle_found: |
|
return "μλ§μ μ°Ύμ μ μμ΅λλ€. (μλ μμ± μλ§ ν¬ν¨)" |
|
|
|
|
|
if cookie_path and os.path.exists(cookie_path): |
|
os.remove(cookie_path) |
|
|
|
return formatted_output |
|
|
|
except Exception as e: |
|
error_msg = str(e) |
|
if "Sign in to confirm your age" in error_msg: |
|
return "μ°λ Ή μ νμ΄ μλ μμμ
λλ€." |
|
elif "confirm you're not a bot" in error_msg: |
|
return "YouTubeκ° λ΄ λ°©μ§λ₯Ό μν΄ μΈμ¦μ μꡬν©λλ€. μ μ ν λ€μ μλν΄μ£ΌμΈμ." |
|
return f"μλ§ μΆμΆ μ€ μ€λ₯κ° λ°μνμ΅λλ€: {error_msg}" |
|
|
|
|
|
iface = gr.Interface( |
|
fn=extract_transcript, |
|
inputs=gr.Textbox( |
|
label="YouTube URL", |
|
placeholder="https://www.youtube.com/watch?v=..." |
|
), |
|
outputs=gr.Textbox( |
|
label="μΆμΆλ μ€ν¬λ¦½νΈ", |
|
lines=20 |
|
), |
|
title="YouTube μλ§ μΆμΆκΈ°", |
|
description=""" |
|
YouTube μμμ URLμ μ
λ ₯νλ©΄ μλ§μ μΆμΆν©λλ€. |
|
- νκ΅μ΄ μλ§ μ°μ (μλ > μλ) |
|
- μμ΄ μλ§ μ°¨μ (μλ > μλ) |
|
""", |
|
allow_flagging="never" |
|
) |
|
|
|
|
|
if __name__ == "__main__": |
|
iface.launch(server_name="0.0.0.0") |