mirror of
https://github.com/lancedb/lancedb.git
synced 2026-01-08 04:42:57 +00:00
622 lines
16 KiB
Plaintext
622 lines
16 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "42bf01fb",
|
|
"metadata": {},
|
|
"source": [
|
|
"# We're going to build a question-and-answer bot\n",
|
|
"\n",
|
|
"It lets you search through YouTube transcripts using natural language."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "48547ddb",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"pip install --quiet openai datasets lancedb"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "22e570f4",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Download the data\n",
|
|
"The dataset contains 700 videos and 208619 transcript sentences."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "a8987fcb",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Found cached dataset json (/Users/changshe/.cache/huggingface/datasets/jamescalam___json/jamescalam--youtube-transcriptions-08d889f6a5386b9b/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"Dataset({\n",
|
|
" features: ['title', 'published', 'url', 'video_id', 'channel_id', 'id', 'text', 'start', 'end'],\n",
|
|
" num_rows: 208619\n",
|
|
"})"
|
|
]
|
|
},
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"from datasets import load_dataset\n",
|
|
"\n",
|
|
"data = load_dataset('jamescalam/youtube-transcriptions', split='train')\n",
|
|
"data"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "5ac2b6a3",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Prepare context\n",
|
|
"\n",
|
|
"Create overlapping context windows of 20 sentences within each video, starting a new window every 4 sentences."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "121a7087",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>title</th>\n",
|
|
" <th>published</th>\n",
|
|
" <th>url</th>\n",
|
|
" <th>video_id</th>\n",
|
|
" <th>channel_id</th>\n",
|
|
" <th>id</th>\n",
|
|
" <th>text</th>\n",
|
|
" <th>start</th>\n",
|
|
" <th>end</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>177622</th>\n",
|
|
" <td>$5 MILLION AI for FREE</td>\n",
|
|
" <td>2022-08-12 15:18:07</td>\n",
|
|
" <td>https://youtu.be/3EjtHs_lXnk</td>\n",
|
|
" <td>3EjtHs_lXnk</td>\n",
|
|
" <td>UCfzlCWGWYyIQ0aLC5w48gBQ</td>\n",
|
|
" <td>3EjtHs_lXnk-t0.0</td>\n",
|
|
" <td>Imagine an AI where all in the same model you ...</td>\n",
|
|
" <td>0.0</td>\n",
|
|
" <td>24.0</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" title published \\\n",
|
|
"177622 $5 MILLION AI for FREE 2022-08-12 15:18:07 \n",
|
|
"\n",
|
|
" url video_id channel_id \\\n",
|
|
"177622 https://youtu.be/3EjtHs_lXnk 3EjtHs_lXnk UCfzlCWGWYyIQ0aLC5w48gBQ \n",
|
|
"\n",
|
|
" id text \\\n",
|
|
"177622 3EjtHs_lXnk-t0.0 Imagine an AI where all in the same model you ... \n",
|
|
"\n",
|
|
" start end \n",
|
|
"177622 0.0 24.0 "
|
|
]
|
|
},
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"from lancedb.context import contextualize\n",
|
|
"\n",
|
|
"df = (contextualize(data.to_pandas())\n",
|
|
" .groupby(\"title\").text_col(\"text\")\n",
|
|
" .window(20).stride(4)\n",
|
|
" .to_df())\n",
|
|
"df.head(1)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "3044e0b0",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Create embedding function\n",
|
|
"We'll call the OpenAI embeddings API to get embeddings"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "c8104467",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import openai\n",
|
|
"import os\n",
|
|
"\n",
|
|
"# Configuring the environment variable OPENAI_API_KEY\n",
|
|
"if \"OPENAI_API_KEY\" not in os.environ:\n",
|
|
" # OR set the key here as a variable\n",
|
|
" openai.api_key = \"sk-...\"\n",
|
|
" \n",
|
|
"assert len(openai.Model.list()[\"data\"]) > 0"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"id": "8eefc159",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import numpy as np\n",
|
|
"def embed_func(c): \n",
|
|
" rs = openai.Embedding.create(input=c, engine=\"text-embedding-ada-002\")\n",
|
|
" return [record[\"embedding\"] for record in rs[\"data\"]]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "2106b5bb",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Create the LanceDB Table"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "13f15068",
|
|
"metadata": {
|
|
"scrolled": false
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"application/vnd.jupyter.widget-view+json": {
|
|
"model_id": "c4fb6f5a4ccc40ddb89d9df497213292",
|
|
"version_major": 2,
|
|
"version_minor": 0
|
|
},
|
|
"text/plain": [
|
|
" 0%| | 0/49 [00:00<?, ?it/s]"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>title</th>\n",
|
|
" <th>published</th>\n",
|
|
" <th>url</th>\n",
|
|
" <th>video_id</th>\n",
|
|
" <th>channel_id</th>\n",
|
|
" <th>id</th>\n",
|
|
" <th>text</th>\n",
|
|
" <th>start</th>\n",
|
|
" <th>end</th>\n",
|
|
" <th>vector</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>$5 MILLION AI for FREE</td>\n",
|
|
" <td>2022-08-12 15:18:07</td>\n",
|
|
" <td>https://youtu.be/3EjtHs_lXnk</td>\n",
|
|
" <td>3EjtHs_lXnk</td>\n",
|
|
" <td>UCfzlCWGWYyIQ0aLC5w48gBQ</td>\n",
|
|
" <td>3EjtHs_lXnk-t0.0</td>\n",
|
|
" <td>Imagine an AI where all in the same model you ...</td>\n",
|
|
" <td>0.0</td>\n",
|
|
" <td>24.0</td>\n",
|
|
" <td>[-0.024402587, -0.00087673456, 0.016499246, -0...</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" title published url \\\n",
|
|
"0 $5 MILLION AI for FREE 2022-08-12 15:18:07 https://youtu.be/3EjtHs_lXnk \n",
|
|
"\n",
|
|
" video_id channel_id id \\\n",
|
|
"0 3EjtHs_lXnk UCfzlCWGWYyIQ0aLC5w48gBQ 3EjtHs_lXnk-t0.0 \n",
|
|
"\n",
|
|
" text start end \\\n",
|
|
"0 Imagine an AI where all in the same model you ... 0.0 24.0 \n",
|
|
"\n",
|
|
" vector \n",
|
|
"0 [-0.024402587, -0.00087673456, 0.016499246, -0... "
|
|
]
|
|
},
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"import lancedb\n",
|
|
"from lancedb.embeddings import with_embeddings\n",
|
|
"\n",
|
|
"data = with_embeddings(embed_func, df, show_progress=True)\n",
|
|
"data.to_pandas().head(1)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"id": "92d53abd",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"48935"
|
|
]
|
|
},
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"db = lancedb.connect(\"/tmp/lancedb\") # local database path (not the current directory)\n",
|
|
"tbl = db.create_table(\"chatbot\", data)\n",
|
|
"len(tbl)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"id": "22892cfd",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>title</th>\n",
|
|
" <th>published</th>\n",
|
|
" <th>url</th>\n",
|
|
" <th>video_id</th>\n",
|
|
" <th>channel_id</th>\n",
|
|
" <th>id</th>\n",
|
|
" <th>text</th>\n",
|
|
" <th>start</th>\n",
|
|
" <th>end</th>\n",
|
|
" <th>vector</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>$5 MILLION AI for FREE</td>\n",
|
|
" <td>2022-08-12 15:18:07</td>\n",
|
|
" <td>https://youtu.be/3EjtHs_lXnk</td>\n",
|
|
" <td>3EjtHs_lXnk</td>\n",
|
|
" <td>UCfzlCWGWYyIQ0aLC5w48gBQ</td>\n",
|
|
" <td>3EjtHs_lXnk-t0.0</td>\n",
|
|
" <td>Imagine an AI where all in the same model you ...</td>\n",
|
|
" <td>0.0</td>\n",
|
|
" <td>24.0</td>\n",
|
|
" <td>[-0.024402587, -0.00087673456, 0.016499246, -0...</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" title published url \\\n",
|
|
"0 $5 MILLION AI for FREE 2022-08-12 15:18:07 https://youtu.be/3EjtHs_lXnk \n",
|
|
"\n",
|
|
" video_id channel_id id \\\n",
|
|
"0 3EjtHs_lXnk UCfzlCWGWYyIQ0aLC5w48gBQ 3EjtHs_lXnk-t0.0 \n",
|
|
"\n",
|
|
" text start end \\\n",
|
|
"0 Imagine an AI where all in the same model you ... 0.0 24.0 \n",
|
|
"\n",
|
|
" vector \n",
|
|
"0 [-0.024402587, -0.00087673456, 0.016499246, -0... "
|
|
]
|
|
},
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"tbl.to_pandas().head(1)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "23afc2f9",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Create and answer the prompt"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"id": "06d8b867",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def create_prompt(query, context):\n",
|
|
" limit = 3750\n",
|
|
"\n",
|
|
" prompt_start = (\n",
|
|
" \"Answer the question based on the context below.\\n\\n\"+\n",
|
|
" \"Context:\\n\"\n",
|
|
" )\n",
|
|
" prompt_end = (\n",
|
|
" f\"\\n\\nQuestion: {query}\\nAnswer:\"\n",
|
|
" )\n",
|
|
" # append contexts until hitting limit\n",
|
|
" for i in range(1, len(context)):\n",
|
|
" if len(\"\\n\\n---\\n\\n\".join(context.text[:i])) >= limit:\n",
|
|
" prompt = (\n",
|
|
" prompt_start +\n",
|
|
" \"\\n\\n---\\n\\n\".join(context.text[:i-1]) +\n",
|
|
" prompt_end\n",
|
|
" )\n",
|
|
" break\n",
|
|
" elif i == len(context)-1:\n",
|
|
" prompt = (\n",
|
|
" prompt_start +\n",
|
|
" \"\\n\\n---\\n\\n\".join(context.text) +\n",
|
|
" prompt_end\n",
|
|
" ) \n",
|
|
" return prompt"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"id": "e09c5142",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"'The 12th person on the moon was Harrison Schmitt, and he landed on December 11, 1972.'"
|
|
]
|
|
},
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"def complete(prompt):\n",
|
|
" # query text-davinci-003\n",
|
|
" res = openai.Completion.create(\n",
|
|
" engine='text-davinci-003',\n",
|
|
" prompt=prompt,\n",
|
|
" temperature=0,\n",
|
|
" max_tokens=400,\n",
|
|
" top_p=1,\n",
|
|
" frequency_penalty=0,\n",
|
|
" presence_penalty=0,\n",
|
|
" stop=None\n",
|
|
" )\n",
|
|
" return res['choices'][0]['text'].strip()\n",
|
|
"\n",
|
|
"# check that it works\n",
|
|
"query = \"who was the 12th person on the moon and when did they land?\"\n",
|
|
"complete(query)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "28705959",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Use LanceDB to find the answer and show the video at the right place"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"id": "c71f5b31",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"query = (\"Which training method should I use for sentence transformers \"\n",
|
|
" \"when I only have pairs of related sentences?\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"id": "603ba92c",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Embed the question\n",
|
|
"emb = embed_func(query)[0]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"id": "80db5c15",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Use LanceDB to get the 3 most relevant contexts\n",
|
|
"context = tbl.search(emb).limit(3).to_df()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"id": "8fcef773",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"'NLI with multiple negative ranking loss.'"
|
|
]
|
|
},
|
|
"execution_count": 13,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Get the answer from completion API\n",
|
|
"prompt = create_prompt(query, context)\n",
|
|
"complete(prompt)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"id": "25714299",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"\n",
|
|
" <iframe\n",
|
|
" width=\"400\"\n",
|
|
" height=\"300\"\n",
|
|
" src=\"https://www.youtube.com/embed/pNvujJ1XyeQ?start=289.76\"\n",
|
|
" frameborder=\"0\"\n",
|
|
" allowfullscreen\n",
|
|
" \n",
|
|
" ></iframe>\n",
|
|
" "
|
|
],
|
|
"text/plain": [
|
|
"<IPython.lib.display.YouTubeVideo at 0x1258aeaa0>"
|
|
]
|
|
},
|
|
"execution_count": 14,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"from IPython.display import YouTubeVideo\n",
|
|
"\n",
|
|
"top_match = context.iloc[0]\n",
|
|
"YouTubeVideo(top_match[\"url\"].split(\"/\")[-1], start=top_match[\"start\"])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "78b7eb11",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.8"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|