
{
"cells": [
{
"cell_type": "markdown",
"id": "42bf01fb",
"metadata": {},
"source": [
"# We're going to build question and answer bot\n",
"\n",
"That allow you to search through youtube transcripts using natural language"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "48547ddb",
"metadata": {},
"outputs": [],
"source": [
"pip install --quiet openai datasets lancedb"
]
},
{
"cell_type": "markdown",
"id": "22e570f4",
"metadata": {},
"source": [
"## Download the data\n",
"700 videos and 208619 sentences"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "a8987fcb",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Found cached dataset json (/Users/changshe/.cache/huggingface/datasets/jamescalam___json/jamescalam--youtube-transcriptions-08d889f6a5386b9b/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n"
]
},
{
"data": {
"text/plain": [
"Dataset({\n",
" features: ['title', 'published', 'url', 'video_id', 'channel_id', 'id', 'text', 'start', 'end'],\n",
" num_rows: 208619\n",
"})"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from datasets import load_dataset\n",
"\n",
"data = load_dataset('jamescalam/youtube-transcriptions', split='train')\n",
"data"
]
},
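{
"cell_type": "markdown",
"id": "1a2b3c4d",
"metadata": {},
"source": [
"The full corpus produces tens of thousands of passages to embed. If you just want to try the pipeline end to end, you can optionally work on a slice of the dataset first (a sketch; pick whatever sample size suits your budget)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5e6f7a8b",
"metadata": {},
"outputs": [],
"source": [
"# Optional: embed only a subset of the corpus to keep API costs down.\n",
"# Uncomment to use; leave commented out to index everything.\n",
"# data = data.select(range(20000))"
]
},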
{
"cell_type": "markdown",
"id": "5ac2b6a3",
"metadata": {},
"source": [
"## Prepare context\n",
"\n",
"Create context of 20 sentences"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "121a7087",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>title</th>\n",
" <th>published</th>\n",
" <th>url</th>\n",
" <th>video_id</th>\n",
" <th>channel_id</th>\n",
" <th>id</th>\n",
" <th>text</th>\n",
" <th>start</th>\n",
" <th>end</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>177622</th>\n",
" <td>$5 MILLION AI for FREE</td>\n",
" <td>2022-08-12 15:18:07</td>\n",
" <td>https://youtu.be/3EjtHs_lXnk</td>\n",
" <td>3EjtHs_lXnk</td>\n",
" <td>UCfzlCWGWYyIQ0aLC5w48gBQ</td>\n",
" <td>3EjtHs_lXnk-t0.0</td>\n",
" <td>Imagine an AI where all in the same model you ...</td>\n",
" <td>0.0</td>\n",
" <td>24.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" title published \\\n",
"177622 $5 MILLION AI for FREE 2022-08-12 15:18:07 \n",
"\n",
" url video_id channel_id \\\n",
"177622 https://youtu.be/3EjtHs_lXnk 3EjtHs_lXnk UCfzlCWGWYyIQ0aLC5w48gBQ \n",
"\n",
" id text \\\n",
"177622 3EjtHs_lXnk-t0.0 Imagine an AI where all in the same model you ... \n",
"\n",
" start end \n",
"177622 0.0 24.0 "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from lancedb.context import contextualize\n",
"\n",
"df = (contextualize(data.to_pandas())\n",
" .groupby(\"title\").text_col(\"text\")\n",
" .window(20).stride(4)\n",
" .to_df())\n",
"df.head(1)"
]
},
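{
"cell_type": "markdown",
"id": "9c0d1e2f",
"metadata": {},
"source": [
"`contextualize` groups the transcript sentences by video title and slides a window of 20 sentences over each group, moving 4 sentences at a time, so each row of `df` holds a longer passage instead of a single short sentence. Roughly the same idea in plain pandas, for a single video (an illustration only; the real helper also keeps the metadata columns):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3a4b5c6d",
"metadata": {},
"outputs": [],
"source": [
"# Illustration: a manual sliding window over one video's sentences\n",
"pdf = data.to_pandas()\n",
"sentences = pdf[pdf.title == \"$5 MILLION AI for FREE\"][\"text\"].tolist()\n",
"window, stride = 20, 4\n",
"passages = [\" \".join(sentences[i:i + window])\n",
"            for i in range(0, max(len(sentences) - window + 1, 1), stride)]\n",
"len(passages), passages[0][:80]"
]
},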
{
"cell_type": "markdown",
"id": "3044e0b0",
"metadata": {},
"source": [
"## Create embedding function\n",
"We'll call the OpenAI embeddings API to get embeddings"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "c8104467",
"metadata": {},
"outputs": [],
"source": [
"import openai\n",
"import os\n",
"\n",
"# Configuring the environment variable OPENAI_API_KEY\n",
"if \"OPENAI_API_KEY\" not in os.environ:\n",
" # OR set the key here as a variable\n",
" openai.api_key = \"sk-...\"\n",
" \n",
"assert len(openai.Model.list()[\"data\"]) > 0"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "8eefc159",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"def embed_func(c): \n",
" rs = openai.Embedding.create(input=c, engine=\"text-embedding-ada-002\")\n",
" return [record[\"embedding\"] for record in rs[\"data\"]]"
]
},
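{
"cell_type": "markdown",
"id": "7e8f9a0b",
"metadata": {},
"source": [
"With this many passages to embed, the OpenAI endpoint may occasionally return rate-limit errors. A minimal retry wrapper (a sketch, assuming the pre-1.0 `openai` client used here) can be passed to `with_embeddings` below in place of `embed_func`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1c2d3e4f",
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"\n",
"def embed_func_with_retry(c, retries=5):\n",
"    # Retry the embedding call with simple exponential backoff on rate limits\n",
"    for attempt in range(retries):\n",
"        try:\n",
"            return embed_func(c)\n",
"        except openai.error.RateLimitError:\n",
"            time.sleep(2 ** attempt)\n",
"    # Final attempt: let any remaining error propagate\n",
"    return embed_func(c)"
]
},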
{
"cell_type": "markdown",
"id": "2106b5bb",
"metadata": {},
"source": [
"## Create the LanceDB Table"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "13f15068",
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "c4fb6f5a4ccc40ddb89d9df497213292",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/49 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>title</th>\n",
" <th>published</th>\n",
" <th>url</th>\n",
" <th>video_id</th>\n",
" <th>channel_id</th>\n",
" <th>id</th>\n",
" <th>text</th>\n",
" <th>start</th>\n",
" <th>end</th>\n",
" <th>vector</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>$5 MILLION AI for FREE</td>\n",
" <td>2022-08-12 15:18:07</td>\n",
" <td>https://youtu.be/3EjtHs_lXnk</td>\n",
" <td>3EjtHs_lXnk</td>\n",
" <td>UCfzlCWGWYyIQ0aLC5w48gBQ</td>\n",
" <td>3EjtHs_lXnk-t0.0</td>\n",
" <td>Imagine an AI where all in the same model you ...</td>\n",
" <td>0.0</td>\n",
" <td>24.0</td>\n",
" <td>[-0.024402587, -0.00087673456, 0.016499246, -0...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" title published url \\\n",
"0 $5 MILLION AI for FREE 2022-08-12 15:18:07 https://youtu.be/3EjtHs_lXnk \n",
"\n",
" video_id channel_id id \\\n",
"0 3EjtHs_lXnk UCfzlCWGWYyIQ0aLC5w48gBQ 3EjtHs_lXnk-t0.0 \n",
"\n",
" text start end \\\n",
"0 Imagine an AI where all in the same model you ... 0.0 24.0 \n",
"\n",
" vector \n",
"0 [-0.024402587, -0.00087673456, 0.016499246, -0... "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import lancedb\n",
"from lancedb.embeddings import with_embeddings\n",
"\n",
"data = with_embeddings(embed_func, df, show_progress=True)\n",
"data.to_pandas().head(1)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "92d53abd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"48935"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"db = lancedb.connect(\"/tmp/lancedb\") # current directory\n",
"tbl = db.create_table(\"chatbot\", data)\n",
"len(tbl)"
]
},
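{
"cell_type": "markdown",
"id": "5a6b7c8d",
"metadata": {},
"source": [
"`create_table` may raise an error if a table named `chatbot` already exists from an earlier run. On later runs you can reopen the stored table instead of re-embedding everything (a small sketch using the connection's table listing):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9e0f1a2b",
"metadata": {},
"outputs": [],
"source": [
"# Reuse the existing table on subsequent runs\n",
"if \"chatbot\" in db.table_names():\n",
"    tbl = db.open_table(\"chatbot\")\n",
"else:\n",
"    tbl = db.create_table(\"chatbot\", data)\n",
"len(tbl)"
]
},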
{
"cell_type": "code",
"execution_count": 7,
"id": "22892cfd",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>title</th>\n",
" <th>published</th>\n",
" <th>url</th>\n",
" <th>video_id</th>\n",
" <th>channel_id</th>\n",
" <th>id</th>\n",
" <th>text</th>\n",
" <th>start</th>\n",
" <th>end</th>\n",
" <th>vector</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>$5 MILLION AI for FREE</td>\n",
" <td>2022-08-12 15:18:07</td>\n",
" <td>https://youtu.be/3EjtHs_lXnk</td>\n",
" <td>3EjtHs_lXnk</td>\n",
" <td>UCfzlCWGWYyIQ0aLC5w48gBQ</td>\n",
" <td>3EjtHs_lXnk-t0.0</td>\n",
" <td>Imagine an AI where all in the same model you ...</td>\n",
" <td>0.0</td>\n",
" <td>24.0</td>\n",
" <td>[-0.024402587, -0.00087673456, 0.016499246, -0...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" title published url \\\n",
"0 $5 MILLION AI for FREE 2022-08-12 15:18:07 https://youtu.be/3EjtHs_lXnk \n",
"\n",
" video_id channel_id id \\\n",
"0 3EjtHs_lXnk UCfzlCWGWYyIQ0aLC5w48gBQ 3EjtHs_lXnk-t0.0 \n",
"\n",
" text start end \\\n",
"0 Imagine an AI where all in the same model you ... 0.0 24.0 \n",
"\n",
" vector \n",
"0 [-0.024402587, -0.00087673456, 0.016499246, -0... "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tbl.to_pandas().head(1)"
]
},
{
"cell_type": "markdown",
"id": "23afc2f9",
"metadata": {},
"source": [
"## Create and answer the prompt"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "06d8b867",
"metadata": {},
"outputs": [],
"source": [
"def create_prompt(query, context):\n",
" limit = 3750\n",
"\n",
" prompt_start = (\n",
" \"Answer the question based on the context below.\\n\\n\"+\n",
" \"Context:\\n\"\n",
" )\n",
" prompt_end = (\n",
" f\"\\n\\nQuestion: {query}\\nAnswer:\"\n",
" )\n",
" # append contexts until hitting limit\n",
" for i in range(1, len(context)):\n",
" if len(\"\\n\\n---\\n\\n\".join(context.text[:i])) >= limit:\n",
" prompt = (\n",
" prompt_start +\n",
" \"\\n\\n---\\n\\n\".join(context.text[:i-1]) +\n",
" prompt_end\n",
" )\n",
" break\n",
" elif i == len(context)-1:\n",
" prompt = (\n",
" prompt_start +\n",
" \"\\n\\n---\\n\\n\".join(context.text) +\n",
" prompt_end\n",
" ) \n",
" return prompt"
]
},
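{
"cell_type": "markdown",
"id": "3c4d5e6f",
"metadata": {},
"source": [
"A quick sanity check of `create_prompt` with a tiny made-up context frame (hypothetical rows, just to see the shape of the prompt):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7a8b9c0d",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"_toy_context = pd.DataFrame({\"text\": [\n",
"    \"Sentence transformers can be trained on pairs of related sentences.\",\n",
"    \"NLI datasets are a common source of such pairs.\",\n",
"]})\n",
"print(create_prompt(\"How can I train a sentence transformer?\", _toy_context))"
]
},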
{
"cell_type": "code",
"execution_count": 9,
"id": "e09c5142",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'The 12th person on the moon was Harrison Schmitt, and he landed on December 11, 1972.'"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def complete(prompt):\n",
" # query text-davinci-003\n",
" res = openai.Completion.create(\n",
" engine='text-davinci-003',\n",
" prompt=prompt,\n",
" temperature=0,\n",
" max_tokens=400,\n",
" top_p=1,\n",
" frequency_penalty=0,\n",
" presence_penalty=0,\n",
" stop=None\n",
" )\n",
" return res['choices'][0]['text'].strip()\n",
"\n",
"# check that it works\n",
"query = \"who was the 12th person on the moon and when did they land?\"\n",
"complete(query)"
]
},
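{
"cell_type": "markdown",
"id": "1e2f3a4b",
"metadata": {},
"source": [
"If `text-davinci-003` is not available to you, a chat model can be substituted (a sketch, again assuming the pre-1.0 `openai` client):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5c6d7e8f",
"metadata": {},
"outputs": [],
"source": [
"def complete_chat(prompt):\n",
"    # Same idea as complete(), but via the chat completions endpoint\n",
"    res = openai.ChatCompletion.create(\n",
"        model=\"gpt-3.5-turbo\",\n",
"        messages=[{\"role\": \"user\", \"content\": prompt}],\n",
"        temperature=0,\n",
"        max_tokens=400,\n",
"    )\n",
"    return res[\"choices\"][0][\"message\"][\"content\"].strip()"
]
},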
{
"cell_type": "markdown",
"id": "28705959",
"metadata": {},
"source": [
"## Use LanceDB to find the answer and show the video at the right place"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "c71f5b31",
"metadata": {},
"outputs": [],
"source": [
"query = (\"Which training method should I use for sentence transformers \"\n",
" \"when I only have pairs of related sentences?\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "603ba92c",
"metadata": {},
"outputs": [],
"source": [
"# Embed the question\n",
"emb = embed_func(query)[0]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "80db5c15",
"metadata": {},
"outputs": [],
"source": [
"# Use LanceDB to get top 3 most relevant context\n",
"context = tbl.search(emb).limit(3).to_df()"
]
},
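{
"cell_type": "markdown",
"id": "9a0b1c2d",
"metadata": {},
"source": [
"Before generating an answer, it can help to look at what was actually retrieved:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3e4f5a6b",
"metadata": {},
"outputs": [],
"source": [
"# Peek at the retrieved passages and where they occur in their videos\n",
"context[[\"title\", \"start\", \"end\", \"text\"]]"
]
},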
{
"cell_type": "code",
"execution_count": 13,
"id": "8fcef773",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'NLI with multiple negative ranking loss.'"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Get the answer from completion API\n",
"prompt = create_prompt(query, context)\n",
"complete(prompt)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "25714299",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" <iframe\n",
" width=\"400\"\n",
" height=\"300\"\n",
" src=\"https://www.youtube.com/embed/pNvujJ1XyeQ?start=289.76\"\n",
" frameborder=\"0\"\n",
" allowfullscreen\n",
" \n",
" ></iframe>\n",
" "
],
"text/plain": [
"<IPython.lib.display.YouTubeVideo at 0x1258aeaa0>"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from IPython.display import YouTubeVideo\n",
"\n",
"top_match = context.iloc[0]\n",
"YouTubeVideo(top_match[\"url\"].split(\"/\")[-1], start=top_match[\"start\"])"
]
},
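{
"cell_type": "markdown",
"id": "7c8d9e0f",
"metadata": {},
"source": [
"Putting it all together, a small helper can answer an arbitrary question and cue up the most relevant video (a sketch composed only of the functions defined above):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1f2a3b4c",
"metadata": {},
"outputs": [],
"source": [
"def answer_question(question, top_k=3):\n",
"    # Embed the question, retrieve context, build the prompt, and complete it\n",
"    emb = embed_func(question)[0]\n",
"    ctx = tbl.search(emb).limit(top_k).to_df()\n",
"    answer = complete(create_prompt(question, ctx))\n",
"    top = ctx.iloc[0]\n",
"    video = YouTubeVideo(top[\"url\"].split(\"/\")[-1], start=top[\"start\"])\n",
"    return answer, video\n",
"\n",
"answer, video = answer_question(query)\n",
"print(answer)\n",
"video"
]
},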
{
"cell_type": "code",
"execution_count": null,
"id": "78b7eb11",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}