yoon2566 committed on
Commit
5a2df22
·
verified ·
1 Parent(s): 85830a8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +90 -77
app.py CHANGED
@@ -2,14 +2,37 @@ import gradio as gr
2
  import yt_dlp
3
  import re
4
  from datetime import timedelta
 
 
5
 
6
def format_timestamp(seconds):
    """Convert a duration in seconds to an ``MM:SS`` string.

    The previous implementation sliced ``str(timedelta(...)).split(':')``
    and therefore returned a two-element list (e.g. ``['01', '05']``)
    instead of the ``MM:SS`` text promised here; it also discarded the hour
    field and leaked fractional seconds into the seconds part.

    Args:
        seconds: Non-negative duration in seconds (int or float). Any
            fractional part is truncated.

    Returns:
        A zero-padded ``MM:SS`` string. Minutes are not wrapped at 60, so
        durations of an hour or more stay unambiguous (3725 -> ``"62:05"``).
    """
    minutes, secs = divmod(int(seconds), 60)
    return f"{minutes:02d}:{secs:02d}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  def extract_transcript(url):
11
- """Extract transcript from YouTube video using yt-dlp"""
12
  try:
 
 
 
13
  ydl_opts = {
14
  'writesubtitles': True,
15
  'writeautomaticsub': True,
@@ -18,6 +41,10 @@ def extract_transcript(url):
18
  'quiet': True
19
  }
20
 
 
 
 
 
21
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
22
  # Get video info
23
  info = ydl.extract_info(url, download=False)
@@ -26,85 +53,67 @@ def extract_transcript(url):
26
  subtitles = info.get('subtitles', {})
27
  automatic_captions = info.get('automatic_captions', {})
28
 
29
- # Try to get subtitles in preferred order
30
- subtitle_text = None
31
-
32
- # 1. Try manual Korean subtitles
33
- if 'ko' in subtitles:
34
- for fmt in ['vtt', 'srv1', 'srv2', 'srv3']:
35
- for sub in subtitles['ko']:
36
- if sub.get('ext') == fmt:
37
- subtitle_text = ydl.write_debug_json(sub['url'])
38
- break
39
- if subtitle_text:
40
- break
41
-
42
- # 2. Try auto-generated Korean subtitles
43
- if not subtitle_text and 'ko' in automatic_captions:
44
- for fmt in ['vtt', 'srv1', 'srv2', 'srv3']:
45
- for sub in automatic_captions['ko']:
46
- if sub.get('ext') == fmt:
47
- subtitle_text = ydl.write_debug_json(sub['url'])
48
- break
49
- if subtitle_text:
50
- break
51
-
52
- # 3. Try English subtitles
53
- if not subtitle_text and 'en' in subtitles:
54
- for fmt in ['vtt', 'srv1', 'srv2', 'srv3']:
55
- for sub in subtitles['en']:
56
- if sub.get('ext') == fmt:
57
- subtitle_text = ydl.write_debug_json(sub['url'])
58
- break
59
- if subtitle_text:
60
- break
61
-
62
- # 4. Try auto-generated English subtitles
63
- if not subtitle_text and 'en' in automatic_captions:
64
- for fmt in ['vtt', 'srv1', 'srv2', 'srv3']:
65
- for sub in automatic_captions['en']:
66
- if sub.get('ext') == fmt:
67
- subtitle_text = ydl.write_debug_json(sub['url'])
68
- break
69
- if subtitle_text:
70
- break
71
-
72
- if not subtitle_text:
73
- return f"μžλ§‰μ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€.\n제λͺ©: {info.get('title')}"
74
-
75
  # Format output
76
- formatted_output = f"제λͺ©: {info.get('title')}\n\n"
 
 
 
77
 
78
- # Parse WebVTT format
79
- lines = subtitle_text.split('\n')
80
- current_time = None
81
- current_text = []
 
 
 
82
 
83
- for line in lines:
84
- # Time stamp line
85
- if re.match(r'\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}', line):
86
- if current_time and current_text:
87
- mins, secs = current_time
88
- formatted_output += f"[{mins}:{secs:02d}] {''.join(current_text)}\n"
89
-
90
- start_time = line.split(' --> ')[0]
91
- time_parts = start_time.split(':')
92
- seconds = int(time_parts[1]) * 60 + float(time_parts[2].split('.')[0])
93
- current_time = divmod(int(seconds), 60)
94
- current_text = []
95
- # Text line
96
- elif line.strip() and not line.startswith('WEBVTT'):
97
- current_text.append(line.strip() + ' ')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
 
99
- # Add last subtitle
100
- if current_time and current_text:
101
- mins, secs = current_time
102
- formatted_output += f"[{mins}:{secs:02d}] {''.join(current_text)}\n"
103
 
 
 
 
 
104
  return formatted_output
105
 
106
  except Exception as e:
107
- return f"μžλ§‰ μΆ”μΆœ 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {str(e)}"
 
 
 
 
 
108
 
109
  # Create Gradio interface
110
  iface = gr.Interface(
@@ -117,8 +126,12 @@ iface = gr.Interface(
117
  label="μΆ”μΆœλœ 슀크립트",
118
  lines=20
119
  ),
120
- title="YouTube μžλ§‰ μΆ”μΆœκΈ° (yt-dlp 버전)",
121
- description="YouTube μ˜μƒμ˜ URL을 μž…λ ₯ν•˜λ©΄ μžλ§‰μ„ μΆ”μΆœν•©λ‹ˆλ‹€. (ν•œκ΅­μ–΄ μš°μ„ , μ˜μ–΄ μ°¨μ„ )",
 
 
 
 
122
  allow_flagging="never"
123
  )
124
 
 
2
  import yt_dlp
3
  import re
4
  from datetime import timedelta
5
+ import os
6
+ import browser_cookie3
7
 
8
def get_cookies():
    """Export the local browser's YouTube cookies to a Netscape cookie file.

    Chrome is tried first and Firefox second, both through
    ``browser_cookie3``. Only cookies whose domain is exactly
    ``.youtube.com`` are written.

    Returns:
        The path of the written cookie file (``/tmp/youtube.txt``), or
        ``None`` when cookies could not be read from either browser.
        The caller is responsible for deleting the file when done.
    """
    cookies = None
    for load_cookies in (browser_cookie3.chrome, browser_cookie3.firefox):
        try:
            cookies = load_cookies(domain_name='.youtube.com')
            break
        except Exception:
            # Best effort: a missing or locked browser profile just means we
            # fall through to the next browser. Unlike a bare ``except:``
            # this does not swallow KeyboardInterrupt / SystemExit.
            continue
    if cookies is None:
        return None

    # Convert cookies to Netscape format
    cookie_path = '/tmp/youtube.txt'
    with open(cookie_path, 'w', encoding='utf-8') as f:
        # Cookie-jar loaders identify the format by this first line.
        f.write("# Netscape HTTP Cookie File\n")
        for cookie in cookies:
            if cookie.domain != '.youtube.com':
                continue
            secure = 'TRUE' if cookie.secure else 'FALSE'
            # Session cookies have ``expires=None``; writing the text "None"
            # yields an invalid entry, so store them with an expiry of 0.
            expires = int(cookie.expires) if cookie.expires else 0
            f.write(f"{cookie.domain}\tTRUE\t{cookie.path}\t"
                    f"{secure}\t{expires}\t"
                    f"{cookie.name}\t{cookie.value}\n")
    return cookie_path
29
 
30
  def extract_transcript(url):
31
+ """Extract transcript from YouTube video using yt-dlp with cookies"""
32
  try:
33
+ # Get cookies
34
+ cookie_path = get_cookies()
35
+
36
  ydl_opts = {
37
  'writesubtitles': True,
38
  'writeautomaticsub': True,
 
41
  'quiet': True
42
  }
43
 
44
+ # Add cookies if available
45
+ if cookie_path:
46
+ ydl_opts['cookiefile'] = cookie_path
47
+
48
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
49
  # Get video info
50
  info = ydl.extract_info(url, download=False)
 
53
  subtitles = info.get('subtitles', {})
54
  automatic_captions = info.get('automatic_captions', {})
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  # Format output
57
+ formatted_output = f"제λͺ©: {info.get('title', '제λͺ© μ—†μŒ')}\n\n"
58
+
59
+ # Process subtitles
60
+ subtitle_found = False
61
 
62
+ # Priority order for subtitles
63
+ subtitle_priorities = [
64
+ ('ko', subtitles), # Manual Korean
65
+ ('ko', automatic_captions), # Auto Korean
66
+ ('en', subtitles), # Manual English
67
+ ('en', automatic_captions) # Auto English
68
+ ]
69
 
70
+ for lang, sub_dict in subtitle_priorities:
71
+ if lang in sub_dict and not subtitle_found:
72
+ subs = sub_dict[lang]
73
+ if isinstance(subs, list) and subs:
74
+ subtitle_found = True
75
+
76
+ # Process each subtitle entry
77
+ for entry in subs:
78
+ if 'src' in entry: # JSON format
79
+ lines = entry['src'].split('\n')
80
+ current_time = None
81
+ current_text = []
82
+
83
+ for line in lines:
84
+ # Time stamp line
85
+ if re.match(r'\d{2}:\d{2}:\d{2}', line):
86
+ if current_time and current_text:
87
+ formatted_output += f"[{current_time}] {''.join(current_text)}\n"
88
+
89
+ time_parts = line.split(':')
90
+ current_time = f"{time_parts[1]}:{time_parts[2].split('.')[0]}"
91
+ current_text = []
92
+ # Text line
93
+ elif line.strip() and not line.startswith('WEBVTT'):
94
+ current_text.append(line.strip() + ' ')
95
+
96
+ # Add last subtitle
97
+ if current_time and current_text:
98
+ formatted_output += f"[{current_time}] {''.join(current_text)}\n"
99
+ break
100
 
101
+ if not subtitle_found:
102
+ return "μžλ§‰μ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€. (μžλ™ 생성 μžλ§‰ 포함)"
 
 
103
 
104
+ # Clean up cookie file
105
+ if cookie_path and os.path.exists(cookie_path):
106
+ os.remove(cookie_path)
107
+
108
  return formatted_output
109
 
110
  except Exception as e:
111
+ error_msg = str(e)
112
+ if "Sign in to confirm your age" in error_msg:
113
+ return "μ—°λ Ή μ œν•œμ΄ μžˆλŠ” μ˜μƒμž…λ‹ˆλ‹€."
114
+ elif "confirm you're not a bot" in error_msg:
115
+ return "YouTubeκ°€ 봇 방지λ₯Ό μœ„ν•΄ 인증을 μš”κ΅¬ν•©λ‹ˆλ‹€. μž μ‹œ ν›„ λ‹€μ‹œ μ‹œλ„ν•΄μ£Όμ„Έμš”."
116
+ return f"μžλ§‰ μΆ”μΆœ 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {error_msg}"
117
 
118
  # Create Gradio interface
119
  iface = gr.Interface(
 
126
  label="μΆ”μΆœλœ 슀크립트",
127
  lines=20
128
  ),
129
+ title="YouTube μžλ§‰ μΆ”μΆœκΈ°",
130
+ description="""
131
+ YouTube μ˜μƒμ˜ URL을 μž…λ ₯ν•˜λ©΄ μžλ§‰μ„ μΆ”μΆœν•©λ‹ˆλ‹€.
132
+ - ν•œκ΅­μ–΄ μžλ§‰ μš°μ„  (μˆ˜λ™ > μžλ™)
133
+ - μ˜μ–΄ μžλ§‰ μ°¨μ„  (μˆ˜λ™ > μžλ™)
134
+ """,
135
  allow_flagging="never"
136
  )
137