Files
lancedb/notebooks/youtube_transcript_search.ipynb
Chang She b91139d3c7 Add tutorial notebook
Convert contextualization and embeddings functionality.
And use it with converted notebook for video search
2023-03-23 15:07:58 -07:00

419 lines
11 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"id": "42bf01fb",
"metadata": {},
"source": [
"# We're going to build question and answer bot\n",
"\n",
"That allow you to search through youtube transcripts using natural language"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "48547ddb",
"metadata": {},
"outputs": [],
"source": [
"pip install --quiet openai datasets lancedb"
]
},
{
"cell_type": "markdown",
"id": "22e570f4",
"metadata": {},
"source": [
"## Download the data\n",
"700 videos and 208619 sentences"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "a8987fcb",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Found cached dataset json (/Users/changshe/.cache/huggingface/datasets/jamescalam___json/jamescalam--youtube-transcriptions-08d889f6a5386b9b/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n"
]
},
{
"data": {
"text/plain": [
"Dataset({\n",
" features: ['title', 'published', 'url', 'video_id', 'channel_id', 'id', 'text', 'start', 'end'],\n",
" num_rows: 208619\n",
"})"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from datasets import load_dataset\n",
"\n",
"data = load_dataset('jamescalam/youtube-transcriptions', split='train')\n",
"data"
]
},
{
"cell_type": "markdown",
"id": "5ac2b6a3",
"metadata": {},
"source": [
"## Prepare context\n",
"\n",
"Create context of 20 sentences"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "121a7087",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>title</th>\n",
" <th>published</th>\n",
" <th>url</th>\n",
" <th>video_id</th>\n",
" <th>channel_id</th>\n",
" <th>id</th>\n",
" <th>text</th>\n",
" <th>start</th>\n",
" <th>end</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>177622</th>\n",
" <td>$5 MILLION AI for FREE</td>\n",
" <td>2022-08-12 15:18:07</td>\n",
" <td>https://youtu.be/3EjtHs_lXnk</td>\n",
" <td>3EjtHs_lXnk</td>\n",
" <td>UCfzlCWGWYyIQ0aLC5w48gBQ</td>\n",
" <td>3EjtHs_lXnk-t0.0</td>\n",
" <td>Imagine an AI where all in the same model you ...</td>\n",
" <td>0.0</td>\n",
" <td>24.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" title published \\\n",
"177622 $5 MILLION AI for FREE 2022-08-12 15:18:07 \n",
"\n",
" url video_id channel_id \\\n",
"177622 https://youtu.be/3EjtHs_lXnk 3EjtHs_lXnk UCfzlCWGWYyIQ0aLC5w48gBQ \n",
"\n",
" id text \\\n",
"177622 3EjtHs_lXnk-t0.0 Imagine an AI where all in the same model you ... \n",
"\n",
" start end \n",
"177622 0.0 24.0 "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from lancedb.context import contextualize\n",
"\n",
"df = (contextualize(data.to_pandas())\n",
" .groupby(\"title\").text_col(\"text\")\n",
" .window(20).stride(4)\n",
" .to_df())\n",
"df.head(1)"
]
},
{
"cell_type": "markdown",
"id": "3044e0b0",
"metadata": {},
"source": [
"## Create embedding function\n",
"We'll call the OpenAI embeddings API to get embeddings"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "8eefc159",
"metadata": {},
"outputs": [],
"source": [
"import openai\n",
"\n",
"# Configure environment variable OPENAI_API_KEY\n",
"# OR add variable openai.api_key = \"sk-...\"\n",
"\n",
"def embed_func(c): \n",
" rs = openai.Embedding.create(input=c, engine=\"text-embedding-ada-002\")\n",
" return [record[\"embedding\"] for record in rs[\"data\"]]"
]
},
{
"cell_type": "markdown",
"id": "2106b5bb",
"metadata": {},
"source": [
"## Create the LanceDB Table"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "13f15068",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Building vector index: IVF64,OPQ96, metric=l2\n"
]
},
{
"data": {
"text/plain": [
"<lance.dataset.LanceDataset at 0x13fd38dc0>"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sample 16384 out of 48935 to train kmeans of 1536 dim, 64 clusters\n"
]
}
],
"source": [
"import lancedb\n",
"from lancedb.embeddings import with_embeddings\n",
"\n",
"data = with_embeddings(embed_func, df, show_progress=True)\n",
"\n",
"db = lancedb.connect(\"/tmp/lancedb\") # current directory\n",
"tbl = db.create_table(\"chatbot\", data)\n",
"tbl.create_index(num_partitions=64, num_sub_vectors=96)"
]
},
{
"cell_type": "markdown",
"id": "23afc2f9",
"metadata": {},
"source": [
"## Create and answer the prompt"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "06d8b867",
"metadata": {},
"outputs": [],
"source": [
"def create_prompt(query, context):\n",
" limit = 3750\n",
"\n",
" prompt_start = (\n",
" \"Answer the question based on the context below.\\n\\n\"+\n",
" \"Context:\\n\"\n",
" )\n",
" prompt_end = (\n",
" f\"\\n\\nQuestion: {query}\\nAnswer:\"\n",
" )\n",
" # append contexts until hitting limit\n",
" for i in range(1, len(context)):\n",
" if len(\"\\n\\n---\\n\\n\".join(context.text[:i])) >= limit:\n",
" prompt = (\n",
" prompt_start +\n",
" \"\\n\\n---\\n\\n\".join(context.text[:i-1]) +\n",
" prompt_end\n",
" )\n",
" break\n",
" elif i == len(context)-1:\n",
" prompt = (\n",
" prompt_start +\n",
" \"\\n\\n---\\n\\n\".join(context.text) +\n",
" prompt_end\n",
" ) \n",
" return prompt"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "e09c5142",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'The 12th person on the moon was Harrison Schmitt, and he landed on December 11, 1972.'"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def complete(prompt):\n",
" # query text-davinci-003\n",
" res = openai.Completion.create(\n",
" engine='text-davinci-003',\n",
" prompt=prompt,\n",
" temperature=0,\n",
" max_tokens=400,\n",
" top_p=1,\n",
" frequency_penalty=0,\n",
" presence_penalty=0,\n",
" stop=None\n",
" )\n",
" return res['choices'][0]['text'].strip()\n",
"\n",
"# check that it works\n",
"query = \"who was the 12th person on the moon and when did they land?\"\n",
"complete(query)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "8fcef773",
"metadata": {},
"outputs": [],
"source": [
"def answer(question):\n",
" emb = embed_func(query)[0]\n",
" context = (tbl.search(emb).limit(3)\n",
" .nprobes(20).refine_factor(100)\n",
" .to_df())\n",
" prompt = create_prompt(question, context)\n",
" return complete(prompt), context.reset_index()"
]
},
{
"cell_type": "markdown",
"id": "28705959",
"metadata": {},
"source": [
"## Show the answer and show the video at the right place"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "25714299",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"NLI with multiple negative ranking loss.\n"
]
},
{
"data": {
"text/html": [
"\n",
" <iframe\n",
" width=\"400\"\n",
" height=\"300\"\n",
" src=\"https://www.youtube.com/embed/pNvujJ1XyeQ?start=289.76\"\n",
" frameborder=\"0\"\n",
" allowfullscreen\n",
" \n",
" ></iframe>\n",
" "
],
"text/plain": [
"<IPython.lib.display.YouTubeVideo at 0x12f58afb0>"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from IPython.display import YouTubeVideo\n",
"\n",
"query = (\"Which training method should I use for sentence transformers \"\n",
" \"when I only have pairs of related sentences?\")\n",
"completion, context = answer(query)\n",
"\n",
"print(completion)\n",
"top_match = context.iloc[0]\n",
"YouTubeVideo(top_match[\"url\"].split(\"/\")[-1], start=top_match[\"start\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "78b7eb11",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}