sradc committed · Commit 1801c3b · 0 parent(s)
initial commit
Files changed:
- .gitattributes +34 -0
- .gitignore +164 -0
- .streamlit/config.toml +5 -0
- README.md +31 -0
- _dev/clip.ipynb +146 -0
- _dev/download_videos.ipynb +80 -0
- _dev/process_video.ipynb +149 -0
- _dev/run_search_over_videos.ipynb +0 -0
- activate +1 -0
- example.py +23 -0
- pipeline/clip_wrapper.py +29 -0
- pipeline/download_videos.py +38 -0
- pipeline/get_video_ids.py +79 -0
- pipeline/process_videos.py +66 -0
- poetry.lock +0 -0
- pyproject.toml +39 -0
- run_pipeline.sh +5 -0
- tests/pipeline/test_clip_wrapper.py +13 -0
- tests/pipeline/test_download_videos.py +10 -0
- video_semantic_search/__init__.py +0 -0
- video_semantic_search/app.py +123 -0
.gitattributes
ADDED
@@ -0,0 +1,34 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,164 @@
+data
+
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
+
.streamlit/config.toml
ADDED
@@ -0,0 +1,5 @@
+[browser]
+gatherUsageStats = false
+
+[theme]
+base="dark"
README.md
ADDED
@@ -0,0 +1,31 @@
+---
+title: Visual Content Search Over Videos
+emoji: 🐢
+colorFrom: yellow
+colorTo: green
+sdk: streamlit
+sdk_version: 1.19.0
+app_file: video_semantic_search/app.py
+pinned: false
+---
+
+# semvideo-hackathon-230521
+
+## Project Description
+
+This project lets you search YouTube videos using a text string. The search is done over the actual video frames,
+rather than any associated text. The search results are displayed as a list of videos, with the most relevant video
+shown first. The user can then click on any of the videos to play it.
+
+## Quick Start
+
+Run the following commands to get started:
+
+```bash
+git clone https://github.com/sradc/semvideo-hackathon-230521.git
+cd semvideo-hackathon-230521
+poetry install
+PYTHONPATH=. poetry run streamlit run video_semantic_search/app.py
+```
+
+If you do not have `poetry` installed, refer to the [poetry documentation](https://python-poetry.org/docs/#installation).
_dev/clip.ipynb
ADDED
@@ -0,0 +1,146 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from typing import List\n",
+    "import requests\n",
+    "from PIL import Image\n",
+    "from transformers import CLIPModel, CLIPProcessor, CLIPFeatureExtractor\n",
+    "import torch"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n",
+    "image = Image.open(requests.get(url, stream=True).raw)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class ClipWrapper:\n",
+    "    def __init__(self):\n",
+    "        self.model = CLIPModel.from_pretrained(\"openai/clip-vit-base-patch32\")\n",
+    "        self.processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-base-patch32\")\n",
+    "\n",
+    "    def images2vec(self, images: List[Image.Image]) -> torch.Tensor:\n",
+    "        inputs = self.processor(images=images, return_tensors=\"pt\")\n",
+    "        with torch.no_grad():\n",
+    "            model_inputs = {k: v.to(self.model.device) for k, v in inputs.items()}\n",
+    "            image_embeds = self.model.vision_model(**model_inputs)\n",
+    "            clip_vectors = self.model.visual_projection(image_embeds[1])\n",
+    "        return clip_vectors / clip_vectors.norm(dim=-1, keepdim=True)\n",
+    "\n",
+    "    def texts2vec(self, texts: List[str]) -> torch.Tensor:\n",
+    "        inputs = self.processor(text=texts, return_tensors=\"pt\", padding=True)\n",
+    "        with torch.no_grad():\n",
+    "            model_inputs = {k: v.to(self.model.device) for k, v in inputs.items()}\n",
+    "            text_embeds = self.model.text_model(**model_inputs)\n",
+    "            text_vectors = self.model.text_projection(text_embeds[1])\n",
+    "        return text_vectors / text_vectors.norm(dim=-1, keepdim=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = CLIPModel.from_pretrained(\"openai/clip-vit-base-patch32\")\n",
+    "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-base-patch32\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 65,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "torch.Size([2, 512])"
+      ]
+     },
+     "execution_count": 65,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "def images2vec(images: List[Image.Image]) -> torch.Tensor:\n",
+    "    inputs = processor(images=images, return_tensors=\"pt\")\n",
+    "    with torch.no_grad():\n",
+    "        model_inputs = {k: v.to(model.device) for k, v in inputs.items()}\n",
+    "        image_embeds = model.vision_model(**model_inputs)\n",
+    "        clip_vectors = model.visual_projection(image_embeds[1])\n",
+    "    return clip_vectors / clip_vectors.norm(dim=-1, keepdim=True)\n",
+    "\n",
+    "\n",
+    "result = images2vec([image, image])\n",
+    "result.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 70,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "torch.Size([2, 512])"
+      ]
+     },
+     "execution_count": 70,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "def texts2vec(texts: List[str]) -> torch.Tensor:\n",
+    "    inputs = processor(text=texts, return_tensors=\"pt\", padding=True)\n",
+    "    with torch.no_grad():\n",
+    "        model_inputs = {k: v.to(model.device) for k, v in inputs.items()}\n",
+    "        text_embeds = model.text_model(**model_inputs)\n",
+    "        text_vectors = model.text_projection(text_embeds[1])\n",
+    "    return text_vectors / text_vectors.norm(dim=-1, keepdim=True)\n",
+    "\n",
+    "\n",
+    "texts2vec([\"a photo of a cat\", \"a photo of a dog\"]).shape"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "semvideo-hackathon-230523",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.16"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
_dev/download_videos.ipynb
ADDED
@@ -0,0 +1,80 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import subprocess\n",
+    "from pathlib import Path\n",
+    "import re\n",
+    "\n",
+    "from tqdm import tqdm\n",
+    "\n",
+    "import pipeline.videos as videos"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 1/1 [00:02<00:00,  2.37s/it]\n"
+     ]
+    }
+   ],
+   "source": [
+    "VIDEO_DIR = Path(\"videos\")\n",
+    "VIDEO_DIR.mkdir(exist_ok=True)\n",
+    "(VIDEO_DIR / \".gitignore\").write_text(\"**\")\n",
+    "\n",
+    "video_urls = [\"https://www.youtube.com/watch?v=frYIj2FGmMA&foo=bar\"]\n",
+    "\n",
+    "\n",
+    "def get_id(url: str) -> str:\n",
+    "    return re.search(r\"(?<=v=)[^&]+\", url).group(0)\n",
+    "\n",
+    "\n",
+    "for video_url in tqdm(video_urls):\n",
+    "    video_id = get_id(video_url)\n",
+    "    video_path = VIDEO_DIR / f\"{video_id}.mp4\"\n",
+    "    if video_path.exists():\n",
+    "        print(f\"Skipping {video_path} because it already exists\")\n",
+    "        continue\n",
+    "    subprocess.run([\"yt-dlp\", \"--quiet\", \"-f\", \"133\", \"-o\", str(video_path), video_url])\n",
+    "\n",
+    "# get_id(video_urls[0])\n",
+    "# # !yt-dlp -f 133 -o \"buster.mp4\" {video_url}\n",
+    "# def download_video(video_url: str) -> None:\n",
+    "#     subprocess.run(['yt-dlp', '-f', '133', '-o', 'buster.mp4', video_url])"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "semvideo-hackathon-230523",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.16"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
_dev/process_video.ipynb
ADDED
@@ -0,0 +1,149 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/sidneyradcliffe/miniforge3/envs/semvideo-hackathon-230523/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "from tqdm import tqdm\n",
+    "\n",
+    "import pandas as pd\n",
+    "import cv2\n",
+    "from PIL import Image\n",
+    "import numpy as np\n",
+    "\n",
+    "from pipeline.clip_wrapper import ClipWrapper, MODEL_DIM\n",
+    "from pipeline.download_videos import VIDEO_DIR, REPO_ROOT, DATA_DIR\n",
+    "\n",
+    "FRAME_EXTRACT_RATE_SECONDS = 5  # Extract a frame every 5 seconds\n",
+    "IMAGES_DIR = DATA_DIR / \"images\"\n",
+    "\n",
+    "DATAFRAME_PATH = DATA_DIR / \"dataset.parquet\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "clip_wrapper = ClipWrapper()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_clip_vectors(video_path):\n",
+    "    cap = cv2.VideoCapture(str(video_path))\n",
+    "    num_video_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))\n",
+    "    fps = int(cap.get(cv2.CAP_PROP_FPS))\n",
+    "    extract_every_n_frames = FRAME_EXTRACT_RATE_SECONDS * fps\n",
+    "    for frame_idx in tqdm(range(num_video_frames), desc=\"Running CLIP on video\"):\n",
+    "        ret, frame = cap.read()\n",
+    "        if frame_idx % extract_every_n_frames != 0:\n",
+    "            continue\n",
+    "        image = Image.fromarray(frame[..., ::-1])\n",
+    "        clip_vector = clip_wrapper.images2vec([image]).squeeze().numpy()\n",
+    "        timestamp_secs = frame_idx / fps\n",
+    "        yield clip_vector, image, timestamp_secs, frame_idx\n",
+    "    cap.release()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Running CLIP on video: 100%|██████████| 7465/7465 [00:04<00:00, 1759.86it/s]\n",
+      "Running CLIP on video: 100%|██████████| 6056/6056 [00:03<00:00, 1728.62it/s]\n",
+      "Running CLIP on video: 100%|██████████| 5234/5234 [00:03<00:00, 1648.12it/s]\n",
+      "Running CLIP on video: 100%|██████████| 3551/3551 [00:01<00:00, 1806.30it/s]\n",
+      "Running CLIP on video: 100%|██████████| 5904/5904 [00:03<00:00, 1655.01it/s]\n",
+      "Processing videos: 100%|██████████| 5/5 [00:16<00:00,  3.30s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Saving data to /Users/sidneyradcliffe/repos/semvideo-hackathon-230523/data/dataset.parquet\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "results = []\n",
+    "for i, video_path in enumerate(\n",
+    "    tqdm(list(VIDEO_DIR.glob(\"*.mp4\")), desc=\"Processing videos\")\n",
+    "):\n",
+    "    video_id = video_path.stem\n",
+    "    extracted_images_dir = IMAGES_DIR / video_id\n",
+    "    extracted_images_dir.mkdir(exist_ok=True, parents=True)\n",
+    "    for clip_vector, image, timestamp_secs, frame_idx in get_clip_vectors(video_path):\n",
+    "        image_path = extracted_images_dir / f\"{frame_idx}.jpg\"\n",
+    "        image.save(image_path)\n",
+    "        results.append(\n",
+    "            [\n",
+    "                video_id,\n",
+    "                frame_idx,\n",
+    "                timestamp_secs,\n",
+    "                str(image_path.relative_to(REPO_ROOT)),\n",
+    "                *clip_vector,\n",
+    "            ]\n",
+    "        )\n",
+    "df = pd.DataFrame(\n",
+    "    results,\n",
+    "    columns=[\"video_id\", \"frame_idx\", \"timestamp\", \"image_path\"]\n",
+    "    + [f\"dim_{i}\" for i in range(MODEL_DIM)],\n",
+    ")\n",
+    "print(f\"Saving data to {DATAFRAME_PATH}\")\n",
+    "df.to_parquet(DATAFRAME_PATH, index=False)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "semvideo-hackathon-230523",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.16"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
_dev/run_search_over_videos.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
activate
ADDED
@@ -0,0 +1 @@
+conda activate semvideo-hackathon-230523
example.py
ADDED
@@ -0,0 +1,23 @@
+import requests
+from PIL import Image
+from transformers import CLIPModel, CLIPProcessor
+
+model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+inputs = processor(
+    text=["a photo of a cat", "a photo of a dog"],
+    images=image,
+    return_tensors="pt",
+    padding=True,
+)
+
+outputs = model(**inputs)
+logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
+probs = logits_per_image.softmax(
+    dim=1
+)  # we can take the softmax to get the label probabilities
+print(probs)
pipeline/clip_wrapper.py
ADDED
@@ -0,0 +1,29 @@
+from typing import List
+
+import torch
+from PIL import Image
+from transformers import CLIPModel, CLIPProcessor
+
+MODEL_DIM = 512
+
+
+class ClipWrapper:
+    def __init__(self):
+        self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+    def images2vec(self, images: List[Image.Image]) -> torch.Tensor:
+        inputs = self.processor(images=images, return_tensors="pt")
+        with torch.no_grad():
+            model_inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
+            image_embeds = self.model.vision_model(**model_inputs)
+            clip_vectors = self.model.visual_projection(image_embeds[1])
+        return clip_vectors / clip_vectors.norm(dim=-1, keepdim=True)
+
+    def texts2vec(self, texts: List[str]) -> torch.Tensor:
+        inputs = self.processor(text=texts, return_tensors="pt", padding=True)
+        with torch.no_grad():
+            model_inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
+            text_embeds = self.model.text_model(**model_inputs)
+            text_vectors = self.model.text_projection(text_embeds[1])
+        return text_vectors / text_vectors.norm(dim=-1, keepdim=True)
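Both methods return unit-norm vectors, so cosine similarity between a text query and an image reduces to a dot product. A minimal usage sketch; `frame.jpg` here is a hypothetical local image, not part of this commit:

```python
# Sketch only: "frame.jpg" is a placeholder image file, not part of the repo.
from PIL import Image

from pipeline.clip_wrapper import ClipWrapper

wrapper = ClipWrapper()
image_vecs = wrapper.images2vec([Image.open("frame.jpg")])  # shape (1, 512)
text_vecs = wrapper.texts2vec(["a dog on a skateboard"])  # shape (1, 512)
# Rows are L2-normalized, so the dot product is the cosine similarity.
print((image_vecs @ text_vecs.T).item())
```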
pipeline/download_videos.py
ADDED
@@ -0,0 +1,38 @@
+import re
+import subprocess
+from pathlib import Path
+from typing import List
+
+from tqdm import tqdm
+
+REPO_ROOT = Path(__file__).parents[1].resolve()
+DATA_DIR = REPO_ROOT / "data"
+VIDEO_DIR = DATA_DIR / "videos"
+VIDEO_ID_FOLDER = DATA_DIR / "ids"
+
+
+def get_id(url: str) -> str:
+    return re.search(r"(?<=v=)[^&]+", url).group(0)
+
+
+def download_videos(video_ids: List[str]) -> None:
+    VIDEO_DIR.mkdir(exist_ok=True, parents=True)
+    for video_id in tqdm(video_ids):
+        video_url = f"https://www.youtube.com/watch?v={video_id}"
+        video_path = VIDEO_DIR / f"{video_id}.mp4"
+        if video_path.exists():
+            print(f"Skipping {video_path} because it already exists")
+            continue
+        subprocess.run(
+            ["yt-dlp", "--quiet", "-f", "135", "-o", str(video_path), video_url]
+        )
+
+
+if __name__ == "__main__":
+    print("Downloading videos...")
+    ids = set()
+    for file in VIDEO_ID_FOLDER.glob("*.txt"):
+        ids.update(
+            [x for x in file.read_text().strip().splitlines(keepends=False) if x]
+        )
+    download_videos(ids)
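For reference, a sketch of how these helpers compose, assuming `yt-dlp` is available on the PATH (the video ID below is the one used in the repo's tests, not a recommendation):

```python
from pipeline.download_videos import VIDEO_DIR, download_videos, get_id

# get_id pulls the value of the "v" query parameter out of a watch URL.
assert get_id("https://www.youtube.com/watch?v=frYIj2FGmMA&foo=bar") == "frYIj2FGmMA"

# Downloads to data/videos/<id>.mp4, skipping files that already exist.
download_videos(["frYIj2FGmMA"])
print(sorted(p.name for p in VIDEO_DIR.glob("*.mp4")))
```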
pipeline/get_video_ids.py
ADDED
@@ -0,0 +1,79 @@
+import hashlib
+import logging
+import os
+from pathlib import Path
+from typing import Final, Optional
+
+import youtube_dl
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(levelname)s - %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+)
+
+PLAYLIST_URLS = [
+    "https://www.youtube.com/playlist?list=PL6Lt9p1lIRZ311J9ZHuzkR5A3xesae2pk",  # 570, Alternative rock of the 2000s (2000-2009)
+    "https://www.youtube.com/playlist?list=PLMC9KNkIncKtGvr2kFRuXBVmBev6cAJ2u",  # 250, Best Pop Music Videos - Top Pop Hits Playlist
+    "https://www.youtube.com/playlist?list=PLmXxqSJJq-yXrCPGIT2gn8b34JjOrl4Xf",  # 184, 80s Music Hits | Best 80s Music Playlist
+    "https://www.youtube.com/playlist?list=PL7DA3D097D6FDBC02",  # 150, 90's Hits - Greatest 1990's Music Hits (Best 90’s Songs Playlist)
+    "https://www.youtube.com/playlist?list=PLeDakahyfrO-4kuBioL5ZAoy4j6aCnzWy",  # 100, Best Music Videos of All Time
+    "https://www.youtube.com/playlist?list=PLMC9KNkIncKtPzgY-5rmhvj7fax8fdxoj",  # 200, Pop Music Playlist - Timeless Pop Songs (Updated Weekly 2023)
+    "https://www.youtube.com/playlist?list=PLkqz3S84Tw-RfPS9HHi3MRmrinOBKxIr8",  # 82, Top POP Hits 2022 – Biggest Pop Music Videos - Vevo
+    "https://www.youtube.com/playlist?list=PLyORnIW1xT6wqvszJbCdLdSjylYMf3sNZ",  # 100, Top 100 Music Videos 2023 - Best Music Videos 2023
+    "https://www.youtube.com/playlist?list=PL1Mmsa-U48mea1oIN-Eus78giJANx4D9W",  # 119, 90s Music Videos
+    "https://www.youtube.com/playlist?list=PLurPBtLcqJqcg3r-HOhR3LZ0aDxpI15Fa",  # 100, 100 Best Music Videos Of The Decade: 2010 - 2019
+    "https://www.youtube.com/playlist?list=PLCQCtoOJpI_A5oktQImEdDBJ50BqHXujj",  # 495, MTV Classic 2000's music videos (US Version)
+]
+URL_FILE: Final[Optional[str]] = os.environ.get("URL_FILE")
+OUTPUT_DIR: Final[str] = os.environ.get("OUTPUT_DIR", "data/ids")
+
+
+def get_all_video_ids(channel_url: str) -> list[str]:
+    """Get all video IDs from a YouTube channel or playlist URL.
+
+    Args:
+        channel_url (str): URL of the YouTube channel or playlist.
+
+    Returns:
+        list[str]: List of video IDs.
+
+    Notes:
+        If you want the videos from a channel, make sure to pass the `/videos` endpoint of the channel.
+    """
+    ydl_opts = {
+        "ignoreerrors": True,
+        "extract_flat": "in_playlist",
+        "dump_single_json": True,
+        "quiet": True,
+    }
+
+    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
+        playlist_info = ydl.extract_info(channel_url, download=False)
+        video_ids = [video["id"] for video in playlist_info["entries"] if "id" in video]
+
+    return video_ids
+
+
+def process_youtube_url(url: str):
+    logging.info(f"Processing {url}")
+    ids = get_all_video_ids(url)
+
+    output_dir = Path(OUTPUT_DIR)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    output = "\n".join(ids)
+    output_path = output_dir / f"{hashlib.md5(output.encode()).hexdigest()}.txt"
+    logging.info(f"Writing {len(ids)} video IDs to {output_path}")
+    with output_path.open(mode="w") as f:
+        f.write(output)
+
+
+def main():
+    logging.info(f"Processing {len(PLAYLIST_URLS)} URLs")
+    for url in PLAYLIST_URLS:
+        process_youtube_url(url)
+
+
+if __name__ == "__main__":
+    main()
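A small sketch of using the collector directly (network access required; note that `URL_FILE` and `OUTPUT_DIR` are read once at import time, so they must be set in the environment before the module is imported):

```python
# Sketch: fetch IDs for the first playlist without writing the usual output file.
from pipeline.get_video_ids import PLAYLIST_URLS, get_all_video_ids

ids = get_all_video_ids(PLAYLIST_URLS[0])
print(f"{len(ids)} video IDs, first few: {ids[:3]}")
```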
pipeline/process_videos.py
ADDED
@@ -0,0 +1,66 @@
+import cv2
+import pandas as pd
+from PIL import Image
+from tqdm import tqdm
+
+from pipeline.clip_wrapper import MODEL_DIM, ClipWrapper
+from pipeline.download_videos import DATA_DIR, REPO_ROOT, VIDEO_DIR
+
+FRAME_EXTRACT_RATE_SECONDS = 5  # Extract a frame every 5 seconds
+IMAGES_DIR = DATA_DIR / "images"
+DATAFRAME_PATH = DATA_DIR / "dataset.parquet"
+
+
+def process_videos() -> None:
+    "Runs clip on video frames, saves results to a parquet file"
+    clip_wrapper = ClipWrapper()
+    results = []
+    for video_path in tqdm(list(VIDEO_DIR.glob("*.mp4")), desc="Processing videos"):
+        video_id = video_path.stem
+        extracted_images_dir = IMAGES_DIR / video_id
+        extracted_images_dir.mkdir(exist_ok=True, parents=True)
+        complete_file = extracted_images_dir / "complete"
+        if complete_file.exists():
+            continue
+        for clip_vector, image, timestamp_secs, frame_idx in get_clip_vectors(
+            video_path, clip_wrapper
+        ):
+            image_path = extracted_images_dir / f"{frame_idx}.jpg"
+            image.save(image_path)
+            results.append(
+                [
+                    video_id,
+                    frame_idx,
+                    timestamp_secs,
+                    str(image_path.relative_to(REPO_ROOT)),
+                    *clip_vector,
+                ]
+            )
+        complete_file.touch()
+    df = pd.DataFrame(
+        results,
+        columns=["video_id", "frame_idx", "timestamp", "image_path"]
+        + [f"dim_{i}" for i in range(MODEL_DIM)],
+    )
+    print(f"Saving data to {DATAFRAME_PATH}")
+    df.to_parquet(DATAFRAME_PATH, index=False)
+
+
+def get_clip_vectors(video_path, clip_wrapper):
+    cap = cv2.VideoCapture(str(video_path))
+    num_video_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    fps = int(cap.get(cv2.CAP_PROP_FPS))
+    extract_every_n_frames = FRAME_EXTRACT_RATE_SECONDS * fps
+    for frame_idx in tqdm(range(num_video_frames), desc="Running CLIP on video"):
+        ret, frame = cap.read()
+        if frame_idx % extract_every_n_frames != 0:
+            continue
+        image = Image.fromarray(frame[..., ::-1])
+        clip_vector = clip_wrapper.images2vec([image]).squeeze().numpy()
+        timestamp_secs = frame_idx / fps
+        yield clip_vector, image, timestamp_secs, frame_idx
+    cap.release()
+
+
+if __name__ == "__main__":
+    process_videos()
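The resulting parquet file holds one row per extracted frame: a few metadata columns plus 512 `dim_*` columns. A sketch of inspecting it, mirroring how the app splits the columns:

```python
# Sketch: load the pipeline output and separate metadata from CLIP vectors.
import pandas as pd

df = pd.read_parquet("data/dataset.parquet")
dim_columns = df.filter(regex="^dim_").columns  # the 512 CLIP dimensions
metadata = df.drop(columns=dim_columns)  # video_id, frame_idx, timestamp, image_path
vectors = df[dim_columns].to_numpy("float32")  # shape (num_frames, 512)
print(metadata.head())
print(vectors.shape)
```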
poetry.lock
ADDED
The diff for this file is too large to render.
See raw diff
pyproject.toml
ADDED
@@ -0,0 +1,39 @@
+[tool.poetry]
+name = "video-semantic-search"
+version = "0.1.0"
+description = ""
+authors = ["Ben Tenmann <[email protected]>", "Sidney Radcliffe <[email protected]>"]
+license = "MIT"
+readme = "README.md"
+packages = [{include = "video_semantic_search"}]
+
+[tool.poetry.dependencies]
+python = ">=3.9,<3.9.7 || >3.9.7,<4.0"
+streamlit = "^1.22.0"
+pandas = "^2.0.1"
+pyarrow = "^12.0.0"
+# faiss-cpu is pinned because older versions segfault when interacting with streamlit:
+# https://github.com/facebookresearch/faiss/issues/2099#issuecomment-961172708
+faiss-cpu = "==1.7.4"
+transformers = "^4.29.2"
+torch = "^2.0.1"
+torchvision = "^0.15.2"
+urllib3 = "1.26.15"
+yt-dlp = "^2023.3.4"
+tqdm = "^4.65.0"
+opencv-python = "^4.7.0.72"
+youtube-dl = "^2021.12.17"
+
+[tool.poetry.group.dev.dependencies]
+notebook = "^6.5.4"
+black = {extras = ["jupyter"], version = "^23.3.0"}
+isort = "^5.12.0"
+pytest = "^7.3.1"
+jupyterlab = "^4.0.0"
+nbconvert = "^7.4.0"
+jupyter-contrib-nbextensions = "^0.7.0"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
run_pipeline.sh
ADDED
@@ -0,0 +1,5 @@
+#!/usr/bin/env bash
+set -e
+
+poetry run python pipeline/download_videos.py
+poetry run python pipeline/process_videos.py
tests/pipeline/test_clip_wrapper.py
ADDED
@@ -0,0 +1,13 @@
+import torch
+
+from pipeline.clip_wrapper import ClipWrapper
+
+
+def test_ClipWrapper():
+    clip_wrapper = ClipWrapper()
+
+    images = [torch.rand(3, 224, 224) for _ in range(2)]
+    assert clip_wrapper.images2vec(images).shape[-1] == 512
+
+    texts = ["a photo of a cat", "a photo of a dog"]
+    assert clip_wrapper.texts2vec(texts).shape[-1] == 512
tests/pipeline/test_download_videos.py
ADDED
@@ -0,0 +1,10 @@
+from pipeline.download_videos import get_id
+
+
+def test_get_id():
+    url1 = "https://www.youtube.com/watch?v=frYIj2FGmMA&foo=bar"
+    url2 = "https://www.youtube.com/watch?v=abcdefg"
+    url3 = "https://www.youtube.com/watch?foo=bar&v=xyz123"
+    assert get_id(url1) == "frYIj2FGmMA"
+    assert get_id(url2) == "abcdefg"
+    assert get_id(url3) == "xyz123"
video_semantic_search/__init__.py
ADDED
File without changes
video_semantic_search/app.py
ADDED
@@ -0,0 +1,123 @@
+import base64
+import os
+from dataclasses import dataclass
+from typing import Final
+
+import faiss
+import numpy as np
+import pandas as pd
+import streamlit as st
+
+from pipeline import clip_wrapper
+
+
+class SemanticSearcher:
+    def __init__(self, dataset: pd.DataFrame):
+        dim_columns = dataset.filter(regex="^dim_").columns
+
+        self.embedder = clip_wrapper.ClipWrapper().texts2vec
+        self.metadata = dataset.drop(columns=dim_columns)
+        self.index = faiss.IndexFlatIP(len(dim_columns))
+        self.index.add(np.ascontiguousarray(dataset[dim_columns].to_numpy(np.float32)))
+
+    def search(self, query: str) -> list["SearchResult"]:
+        v = self.embedder([query]).detach().numpy()
+        D, I = self.index.search(v, 10)
+        return [
+            SearchResult(
+                video_id=row["video_id"],
+                frame_idx=row["frame_idx"],
+                timestamp=row["timestamp"],
+                score=score,
+            )
+            for score, (_, row) in zip(D[0], self.metadata.iloc[I[0]].iterrows())
+        ]
+
+
+DATASET_PATH: Final[str] = os.environ.get("DATASET_PATH", "data/dataset.parquet")
+SEARCHER: Final[SemanticSearcher] = SemanticSearcher(pd.read_parquet(DATASET_PATH))
+
+
+@dataclass
+class SearchResult:
+    video_id: str
+    frame_idx: int
+    timestamp: float
+    score: float
+
+
+def get_video_url(video_id: str, timestamp: float) -> str:
+    return f"https://www.youtube.com/watch?v={video_id}&t={int(timestamp)}"
+
+
+def display_search_results(results: list[SearchResult]) -> None:
+    col_count = 3  # Number of videos per row
+
+    col_num = 0  # Counter to keep track of the current column
+    row = st.empty()  # Placeholder for the current row
+
+    for i, result in enumerate(results):
+        if col_num == 0:
+            row = st.columns(col_count)  # Create a new row of columns
+
+        with row[col_num]:
+            # Apply CSS styling to the video container
+            st.markdown(
+                """
+                <style>
+                .video-container {
+                    position: relative;
+                    padding-bottom: 56.25%;
+                    padding-top: 30px;
+                    height: 0;
+                    overflow: hidden;
+                }
+                .video-container iframe,
+                .video-container object,
+                .video-container embed {
+                    position: absolute;
+                    top: 0;
+                    left: 0;
+                    width: 100%;
+                    height: 100%;
+                }
+                </style>
+                """,
+                unsafe_allow_html=True,
+            )
+
+            # Display the embedded YouTube video
+            # st.video(get_video_url(result.video_id), start_time=int(result.timestamp))
+            # st.image(f"data/images/{result.video_id}/{result.frame_idx}.jpg")
+            with open(
+                f"data/images/{result.video_id}/{result.frame_idx}.jpg", "rb"
+            ) as f:
+                image = f.read()
+            encoded = base64.b64encode(image).decode()
+            st.markdown(
+                f"""
+                <a href="{get_video_url(result.video_id, result.timestamp)}">
+                    <img src="data:image/jpeg;base64,{encoded}" alt="frame {result.frame_idx}" width="100%">
+                </a>
+                """,
+                unsafe_allow_html=True,
+            )
+
+        col_num += 1
+        if col_num >= col_count:
+            col_num = 0
+
+
+def main():
+    st.set_page_config(page_title="video-semantic-search", layout="wide")
+    st.header("Video Semantic Search")
+
+    st.text_input("What are you looking for?", key="query")
+
+    query = st.session_state["query"]
+    if query:
+        display_search_results(SEARCHER.search(query))
+
+
+if __name__ == "__main__":
+    main()
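Because the stored vectors are unit-norm, the `IndexFlatIP` inner-product search is effectively a cosine-similarity search. A sketch of querying the index headlessly (note that importing the module also builds the global `SEARCHER` from `data/dataset.parquet`, so this should be run from the repo root after the pipeline has produced that file; the query string is just an example):

```python
# Sketch: use the searcher without the Streamlit UI.
import pandas as pd

from video_semantic_search.app import SemanticSearcher

searcher = SemanticSearcher(pd.read_parquet("data/dataset.parquet"))
for result in searcher.search("a red sports car"):  # hypothetical query
    print(f"{result.score:.3f}  {result.video_id} @ {result.timestamp:.0f}s")
```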