Spaces:
Runtime error
Runtime error
File size: 1,134 Bytes
1801c3b 44efe1c 1801c3b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
import re
import subprocess
from pathlib import Path
from typing import List
from tqdm import tqdm
# Repository root: the directory one level above the folder containing this
# file. The path is made absolute *before* indexing ``parents`` so that a
# relative, single-component ``__file__`` (e.g. a script launched from its own
# directory on older interpreters) cannot raise IndexError.
REPO_ROOT = Path(__file__).absolute().parents[1].resolve()
# Root of the data files used by this script.
DATA_DIR = REPO_ROOT / "data"
# Destination directory for downloaded ``<video_id>.mp4`` files.
VIDEO_DIR = DATA_DIR / "videos"
# Directory scanned for ``*.txt`` files that list video IDs, one per line.
VIDEO_ID_FOLDER = DATA_DIR / "ids"
def get_id(url: str) -> str:
    """Extract the video ID from a YouTube watch URL.

    The ID is the value of the ``v`` query parameter, e.g. ``abc123`` in
    ``https://www.youtube.com/watch?v=abc123&t=5``.

    Args:
        url: A URL (or bare query string) containing a ``v=<id>`` parameter.

    Returns:
        The value of the ``v`` parameter.

    Raises:
        ValueError: If ``url`` contains no non-empty ``v`` parameter.
    """
    # Anchor on a parameter boundary so that e.g. ``prev=...`` is not mistaken
    # for ``v=...``, and stop at ``#`` so a URL fragment never leaks into the ID.
    match = re.search(r"(?:^|[?&])v=([^&#]+)", url)
    if match is None:
        raise ValueError(f"No video id ('v' parameter) found in URL: {url!r}")
    return match.group(1)
def download_videos(video_ids: List[str], video_format: str = "135") -> None:
    """Download YouTube videos into ``VIDEO_DIR`` as ``<video_id>.mp4``.

    Videos whose target file already exists are skipped. A failed download is
    reported and the remaining IDs are still processed.

    Args:
        video_ids: YouTube video IDs to download. Any iterable of strings
            works; it is only iterated once.
        video_format: Value passed to yt-dlp's ``-f`` option. Defaults to
            ``"135"``, the format code this script has always requested.

    Raises:
        FileNotFoundError: If the ``yt-dlp`` executable is not on ``PATH``.
    """
    VIDEO_DIR.mkdir(exist_ok=True, parents=True)
    # A catch-all .gitignore keeps the downloaded videos out of version control.
    (VIDEO_DIR / ".gitignore").write_text("*")

    for video_id in tqdm(video_ids):
        video_url = f"https://www.youtube.com/watch?v={video_id}"
        video_path = VIDEO_DIR / f"{video_id}.mp4"
        if video_path.exists():
            # tqdm.write prints the message without corrupting the progress bar.
            tqdm.write(f"Skipping {video_path} because it already exists")
            continue
        result = subprocess.run(
            [
                "yt-dlp",
                "--quiet",
                "-f",
                video_format,
                "-o",
                str(video_path),
                video_url,
            ]
        )
        # Surface failures instead of silently moving on; keep going so one
        # unavailable video does not abort the whole batch.
        if result.returncode != 0:
            tqdm.write(
                f"Failed to download {video_url} "
                f"(yt-dlp exit code {result.returncode})"
            )
if __name__ == "__main__":
    print("Downloading videos...")
    # Gather the unique, non-empty IDs from every ``*.txt`` file in the ID
    # folder. Each line is stripped so stray whitespace never ends up in an ID.
    ids = set()
    for id_file in VIDEO_ID_FOLDER.glob("*.txt"):
        for line in id_file.read_text().splitlines():
            video_id = line.strip()
            if video_id:
                ids.add(video_id)
    if not ids:
        print(f"No video IDs found in {VIDEO_ID_FOLDER}")
    # Pass a sorted list: it matches the declared ``List[str]`` parameter and
    # makes the download order reproducible between runs.
    download_videos(sorted(ids))
|