lancedb/notebooks/youtube_transcript_search.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "42bf01fb",
   "metadata": {},
   "source": [
    "# We're going to build a question-and-answer bot\n",
    "\n",
    "It allows you to search through YouTube transcripts using natural language"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "48547ddb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# openai is pinned below 1.0: later cells call openai.Embedding and openai.Completion, which 1.0 removed\n",
    "%pip install --quiet \"openai<1\" datasets lancedb"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "22e570f4",
   "metadata": {},
   "source": [
    "## Download the data\n",
    "700 videos and 208619 sentences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "a8987fcb",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Found cached dataset json (/Users/changshe/.cache/huggingface/datasets/jamescalam___json/jamescalam--youtube-transcriptions-08d889f6a5386b9b/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['title', 'published', 'url', 'video_id', 'channel_id', 'id', 'text', 'start', 'end'],\n",
       "    num_rows: 208619\n",
       "})"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "# load the train split of the YouTube transcriptions dataset and display its summary\n",
    "data = load_dataset(\n",
    "    path=\"jamescalam/youtube-transcriptions\",\n",
    "    split=\"train\",\n",
    ")\n",
    "data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5ac2b6a3",
   "metadata": {},
   "source": [
    "## Prepare context\n",
    "\n",
    "Create contexts of 20 sentences each"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "121a7087",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>published</th>\n",
       "      <th>url</th>\n",
       "      <th>video_id</th>\n",
       "      <th>channel_id</th>\n",
       "      <th>id</th>\n",
       "      <th>text</th>\n",
       "      <th>start</th>\n",
       "      <th>end</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>177622</th>\n",
       "      <td>$5 MILLION AI for FREE</td>\n",
       "      <td>2022-08-12 15:18:07</td>\n",
       "      <td>https://youtu.be/3EjtHs_lXnk</td>\n",
       "      <td>3EjtHs_lXnk</td>\n",
       "      <td>UCfzlCWGWYyIQ0aLC5w48gBQ</td>\n",
       "      <td>3EjtHs_lXnk-t0.0</td>\n",
       "      <td>Imagine an AI where all in the same model you ...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>24.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                         title            published  \\\n",
       "177622  $5 MILLION AI for FREE  2022-08-12 15:18:07   \n",
       "\n",
       "                                 url     video_id                channel_id  \\\n",
       "177622  https://youtu.be/3EjtHs_lXnk  3EjtHs_lXnk  UCfzlCWGWYyIQ0aLC5w48gBQ   \n",
       "\n",
       "                      id                                               text  \\\n",
       "177622  3EjtHs_lXnk-t0.0  Imagine an AI where all in the same model you ...   \n",
       "\n",
       "        start   end  \n",
       "177622    0.0  24.0  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from lancedb.context import contextualize\n",
    "\n",
    "# group rows by title, then build text contexts with window=20 and stride=4\n",
    "transcripts = data.to_pandas()\n",
    "contextualizer = contextualize(transcripts).groupby(\"title\").text_col(\"text\")\n",
    "df = contextualizer.window(20).stride(4).to_df()\n",
    "df.head(1)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3044e0b0",
   "metadata": {},
   "source": [
    "## Create embedding function\n",
    "We'll call the OpenAI embeddings API to get embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "8eefc159",
   "metadata": {},
   "outputs": [],
   "source": [
    "import openai\n",
    "\n",
    "# Configure environment variable OPENAI_API_KEY\n",
    "# OR add variable openai.api_key = \"sk-...\"\n",
    "\n",
    "def embed_func(c):\n",
    "    \"\"\"Return the embeddings the OpenAI embeddings API produces for ``c``.\n",
    "\n",
    "    Sends ``c`` to the ``text-embedding-ada-002`` engine and collects the\n",
    "    ``embedding`` field of every record in the response's ``data`` list.\n",
    "    \"\"\"\n",
    "    response = openai.Embedding.create(engine=\"text-embedding-ada-002\", input=c)\n",
    "    embeddings = []\n",
    "    for item in response[\"data\"]:\n",
    "        embeddings.append(item[\"embedding\"])\n",
    "    return embeddings"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2106b5bb",
   "metadata": {},
   "source": [
    "## Create the LanceDB Table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "13f15068",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Building vector index: IVF64,OPQ96, metric=l2\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<lance.dataset.LanceDataset at 0x13fd38dc0>"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 16384 out of 48935 to train kmeans of 1536 dim, 64 clusters\n"
     ]
    }
   ],
   "source": [
    "import lancedb\n",
    "from lancedb.embeddings import with_embeddings\n",
    "\n",
    "# keep the embedded table under its own name so `data` still refers to the\n",
    "# raw dataset if an earlier cell is re-run\n",
    "embedded = with_embeddings(embed_func, df, show_progress=True)\n",
    "\n",
    "db = lancedb.connect(\"/tmp/lancedb\")  # local database directory\n",
    "tbl = db.create_table(\"chatbot\", embedded)\n",
    "tbl.create_index(num_partitions=64, num_sub_vectors=96)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "23afc2f9",
   "metadata": {},
   "source": [
    "## Create and answer the prompt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "06d8b867",
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_prompt(query, context, limit=3750):\n",
    "    \"\"\"Build a completion prompt from the retrieved contexts and the question.\n",
    "\n",
    "    Leading rows of ``context.text`` are added in order for as long as the\n",
    "    joined context stays shorter than ``limit`` characters; the first row that\n",
    "    would reach the limit, and everything after it, is left out.\n",
    "\n",
    "    Args:\n",
    "        query: The question placed at the end of the prompt.\n",
    "        context: Table-like object whose ``text`` column holds the passages.\n",
    "        limit: Exclusive upper bound on the joined context length, in characters.\n",
    "\n",
    "    Returns:\n",
    "        The prompt string: instruction header, the kept passages separated by\n",
    "        ``---`` rules, then the question and an ``Answer:`` cue.\n",
    "    \"\"\"\n",
    "    separator = \"\\n\\n---\\n\\n\"\n",
    "    prompt_start = (\n",
    "        \"Answer the question based on the context below.\\n\\n\"\n",
    "        \"Context:\\n\"\n",
    "    )\n",
    "    prompt_end = f\"\\n\\nQuestion: {query}\\nAnswer:\"\n",
    "\n",
    "    kept = []\n",
    "    used = 0\n",
    "    for passage in context.text:\n",
    "        # the separator is only paid for from the second passage onwards\n",
    "        cost = len(passage) + (len(separator) if kept else 0)\n",
    "        if used + cost >= limit:\n",
    "            break\n",
    "        kept.append(passage)\n",
    "        used += cost\n",
    "    # building the prompt once, after the loop, also handles zero or one passage\n",
    "    return prompt_start + separator.join(kept) + prompt_end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "e09c5142",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'The 12th person on the moon was Harrison Schmitt, and he landed on December 11, 1972.'"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def complete(prompt):\n",
    "    \"\"\"Send ``prompt`` to text-davinci-003 and return the stripped text of the first choice.\"\"\"\n",
    "    request = {\n",
    "        \"engine\": \"text-davinci-003\",\n",
    "        \"prompt\": prompt,\n",
    "        \"max_tokens\": 400,\n",
    "        \"temperature\": 0,\n",
    "        \"top_p\": 1,\n",
    "        \"frequency_penalty\": 0,\n",
    "        \"presence_penalty\": 0,\n",
    "        \"stop\": None,\n",
    "    }\n",
    "    response = openai.Completion.create(**request)\n",
    "    first_choice = response[\"choices\"][0]\n",
    "    return first_choice[\"text\"].strip()\n",
    "\n",
    "# smoke test: ask the model directly, without any retrieved context\n",
    "query = \"who was the 12th person on the moon and when did they land?\"\n",
    "complete(query)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "8fcef773",
   "metadata": {},
   "outputs": [],
   "source": [
    "def answer(question):\n",
    "    \"\"\"Answer ``question`` using the transcript passages nearest to it.\n",
    "\n",
    "    Embeds the question, fetches the 3 closest rows from ``tbl``, builds a\n",
    "    prompt from them with ``create_prompt`` and passes it to ``complete``.\n",
    "\n",
    "    Returns:\n",
    "        A ``(completion, context)`` tuple: the answer text and the retrieved\n",
    "        rows after ``reset_index()``.\n",
    "    \"\"\"\n",
    "    # embed the argument, not the notebook-level `query` variable\n",
    "    emb = embed_func(question)[0]\n",
    "    context = (tbl.search(emb).limit(3)\n",
    "               .nprobes(20).refine_factor(100)\n",
    "               .to_df())\n",
    "    prompt = create_prompt(question, context)\n",
    "    return complete(prompt), context.reset_index()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "28705959",
   "metadata": {},
   "source": [
    "## Show the answer and show the video at the right place"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "25714299",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "NLI with multiple negative ranking loss.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "        <iframe\n",
       "            width=\"400\"\n",
       "            height=\"300\"\n",
       "            src=\"https://www.youtube.com/embed/pNvujJ1XyeQ?start=289.76\"\n",
       "            frameborder=\"0\"\n",
       "            allowfullscreen\n",
       "            \n",
       "        ></iframe>\n",
       "        "
      ],
      "text/plain": [
       "<IPython.lib.display.YouTubeVideo at 0x12f58afb0>"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from IPython.display import YouTubeVideo\n",
    "\n",
    "query = (\"Which training method should I use for sentence transformers \"\n",
    "         \"when I only have pairs of related sentences?\")\n",
    "completion, context = answer(query)\n",
    "\n",
    "print(completion)\n",
    "top_match = context.iloc[0]\n",
    "video_id = top_match[\"url\"].split(\"/\")[-1]\n",
    "# the embed player's `start` parameter is documented as whole seconds, so drop the fraction\n",
    "YouTubeVideo(video_id, start=int(top_match[\"start\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "78b7eb11",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}