bump version for v0.0.3

Merge pull request #16 from lancedb/jaichopra/readme-lance-benchmarking
add Lance benchmarking blog
2025-12-25 06:19:57 +00:00 · 2023-03-30 19:19:29 -07:00 · 2023-03-30 19:04:44 -07:00 · 2023-03-29 11:44:36 -07:00 · 2023-03-24 19:50:35 -07:00 · 2023-03-24 19:45:46 -07:00
6 changed files with 357 additions and 72 deletions
--- a/README.md
+++ b/README.md
@@ -47,3 +47,7 @@ table = db.create_table("my_table",
                               {"vector": [5.9, 26.5], "item": "bar", "price": 20.0}])
 result = table.search([100, 100]).limit(2).to_df()
 ```
+
+## Blogs, Tutorials & Videos
+* 📈 <a href="https://blog.eto.ai/benchmarking-random-access-in-lance-ed690757a826">2000x better performance with Lance over Parquet</a>
+* 🤖 <a href="https://github.com/lancedb/lancedb/blob/main/notebooks/youtube_transcript_search.ipynb">Build a question and answer bot with LanceDB</a>
--- a/notebooks/youtube_transcript_search.ipynb
+++ b/notebooks/youtube_transcript_search.ipynb
@@ -31,7 +31,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
   "id": "a8987fcb",
   "metadata": {},
   "outputs": [
@@ -51,7 +51,7 @@
       "})"
      ]
     },
-     "execution_count": 2,
+     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -75,7 +75,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
   "id": "121a7087",
   "metadata": {},
   "outputs": [
@@ -142,7 +142,7 @@
       "177622    0.0  24.0  "
      ]
     },
-     "execution_count": 3,
+     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -166,6 +166,24 @@
    "We'll call the OpenAI embeddings API to get embeddings"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "c8104467",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import openai\n",
+    "import os\n",
+    "\n",
+    "# Configuring the environment variable OPENAI_API_KEY\n",
+    "if \"OPENAI_API_KEY\" not in os.environ:\n",
+    "    # OR set the key here as a variable\n",
+    "    openai.api_key = \"sk-...\"\n",
+    "    \n",
+    "assert len(openai.Model.list()[\"data\"]) > 0"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": 4,
@@ -173,11 +191,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "import openai\n",
-    "\n",
-    "# Configure environment variable OPENAI_API_KEY\n",
-    "# OR add variable openai.api_key = \"sk-...\"\n",
-    "\n",
+    "import numpy as np\n",
    "def embed_func(c):    \n",
    "    rs = openai.Embedding.create(input=c, engine=\"text-embedding-ada-002\")\n",
    "    return [record[\"embedding\"] for record in rs[\"data\"]]"
@@ -193,33 +207,94 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 5,
   "id": "13f15068",
-   "metadata": {},
+   "metadata": {
+    "scrolled": false
+   },
   "outputs": [
    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Building vector index: IVF64,OPQ96, metric=l2\n"
-     ]
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c4fb6f5a4ccc40ddb89d9df497213292",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/49 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
    },
    {
     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>title</th>\n",
+       "      <th>published</th>\n",
+       "      <th>url</th>\n",
+       "      <th>video_id</th>\n",
+       "      <th>channel_id</th>\n",
+       "      <th>id</th>\n",
+       "      <th>text</th>\n",
+       "      <th>start</th>\n",
+       "      <th>end</th>\n",
+       "      <th>vector</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>$5 MILLION AI for FREE</td>\n",
+       "      <td>2022-08-12 15:18:07</td>\n",
+       "      <td>https://youtu.be/3EjtHs_lXnk</td>\n",
+       "      <td>3EjtHs_lXnk</td>\n",
+       "      <td>UCfzlCWGWYyIQ0aLC5w48gBQ</td>\n",
+       "      <td>3EjtHs_lXnk-t0.0</td>\n",
+       "      <td>Imagine an AI where all in the same model you ...</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>24.0</td>\n",
+       "      <td>[-0.024402587, -0.00087673456, 0.016499246, -0...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
      "text/plain": [
-       "<lance.dataset.LanceDataset at 0x13fd38dc0>"
+       "                    title            published                           url  \\\n",
+       "0  $5 MILLION AI for FREE  2022-08-12 15:18:07  https://youtu.be/3EjtHs_lXnk   \n",
+       "\n",
+       "      video_id                channel_id                id  \\\n",
+       "0  3EjtHs_lXnk  UCfzlCWGWYyIQ0aLC5w48gBQ  3EjtHs_lXnk-t0.0   \n",
+       "\n",
+       "                                                text  start   end  \\\n",
+       "0  Imagine an AI where all in the same model you ...    0.0  24.0   \n",
+       "\n",
+       "                                              vector  \n",
+       "0  [-0.024402587, -0.00087673456, 0.016499246, -0...  "
      ]
     },
-     "execution_count": 7,
+     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Sample 16384 out of 48935 to train kmeans of 1536 dim, 64 clusters\n"
-     ]
    }
   ],
   "source": [
@@ -227,10 +302,110 @@
    "from lancedb.embeddings import with_embeddings\n",
    "\n",
    "data = with_embeddings(embed_func, df, show_progress=True)\n",
-    "\n",
+    "data.to_pandas().head(1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "92d53abd",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "48935"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
    "db = lancedb.connect(\"/tmp/lancedb\")  # current directory\n",
    "tbl = db.create_table(\"chatbot\", data)\n",
-    "tbl.create_index(num_partitions=64, num_sub_vectors=96)"
+    "len(tbl)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "22892cfd",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>title</th>\n",
+       "      <th>published</th>\n",
+       "      <th>url</th>\n",
+       "      <th>video_id</th>\n",
+       "      <th>channel_id</th>\n",
+       "      <th>id</th>\n",
+       "      <th>text</th>\n",
+       "      <th>start</th>\n",
+       "      <th>end</th>\n",
+       "      <th>vector</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>$5 MILLION AI for FREE</td>\n",
+       "      <td>2022-08-12 15:18:07</td>\n",
+       "      <td>https://youtu.be/3EjtHs_lXnk</td>\n",
+       "      <td>3EjtHs_lXnk</td>\n",
+       "      <td>UCfzlCWGWYyIQ0aLC5w48gBQ</td>\n",
+       "      <td>3EjtHs_lXnk-t0.0</td>\n",
+       "      <td>Imagine an AI where all in the same model you ...</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>24.0</td>\n",
+       "      <td>[-0.024402587, -0.00087673456, 0.016499246, -0...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                    title            published                           url  \\\n",
+       "0  $5 MILLION AI for FREE  2022-08-12 15:18:07  https://youtu.be/3EjtHs_lXnk   \n",
+       "\n",
+       "      video_id                channel_id                id  \\\n",
+       "0  3EjtHs_lXnk  UCfzlCWGWYyIQ0aLC5w48gBQ  3EjtHs_lXnk-t0.0   \n",
+       "\n",
+       "                                                text  start   end  \\\n",
+       "0  Imagine an AI where all in the same model you ...    0.0  24.0   \n",
+       "\n",
+       "                                              vector  \n",
+       "0  [-0.024402587, -0.00087673456, 0.016499246, -0...  "
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tbl.to_pandas().head(1)"
   ]
  },
  {
@@ -313,43 +488,76 @@
    "complete(query)"
   ]
  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "id": "8fcef773",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def answer(question):\n",
-    "    emb = embed_func(query)[0]\n",
-    "    context = (tbl.search(emb).limit(3)\n",
-    "               .nprobes(20).refine_factor(100)\n",
-    "               .to_df())\n",
-    "    prompt = create_prompt(question, context)\n",
-    "    return complete(prompt), context.reset_index()"
-   ]
-  },
  {
   "cell_type": "markdown",
   "id": "28705959",
   "metadata": {},
   "source": [
-    "## Show the answer and show the video at the right place"
+    "## Use LanceDB to find the answer and show the video at the right place"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "c71f5b31",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "query = (\"Which training method should I use for sentence transformers \"\n",
+    "         \"when I only have pairs of related sentences?\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "603ba92c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Embed the question\n",
+    "emb = embed_func(query)[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "80db5c15",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Use LanceDB to get top 3 most relevant context\n",
+    "context = tbl.search(emb).limit(3).to_df()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
-   "id": "25714299",
+   "id": "8fcef773",
   "metadata": {},
   "outputs": [
    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "NLI with multiple negative ranking loss.\n"
-     ]
-    },
+     "data": {
+      "text/plain": [
+       "'NLI with multiple negative ranking loss.'"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Get the answer from completion API\n",
+    "prompt = create_prompt(query, context)\n",
+    "complete(prompt)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "25714299",
+   "metadata": {},
+   "outputs": [
    {
     "data": {
      "text/html": [
@@ -365,10 +573,10 @@
       "        "
      ],
      "text/plain": [
-       "<IPython.lib.display.YouTubeVideo at 0x12f58afb0>"
+       "<IPython.lib.display.YouTubeVideo at 0x1258aeaa0>"
      ]
     },
-     "execution_count": 13,
+     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -376,11 +584,6 @@
   "source": [
    "from IPython.display import YouTubeVideo\n",
    "\n",
-    "query = (\"Which training method should I use for sentence transformers \"\n",
-    "         \"when I only have pairs of related sentences?\")\n",
-    "completion, context = answer(query)\n",
-    "\n",
-    "print(completion)\n",
    "top_match = context.iloc[0]\n",
    "YouTubeVideo(top_match[\"url\"].split(\"/\")[-1], start=top_match[\"start\"])"
   ]
--- a/python/lancedb/embeddings.py
+++ b/python/lancedb/embeddings.py
@@ -12,7 +12,6 @@
 #  limitations under the License.

 import math
-import ratelimiter
 from retry import retry
 from typing import Callable, Union

@@ -32,11 +31,12 @@ def with_embeddings(
 ):
    func = EmbeddingFunction(func)
    if wrap_api:
-        func = func.retry().rate_limit().batch_size(batch_size)
+        func = func.retry().rate_limit()
+    func = func.batch_size(batch_size)
    if show_progress:
        func = func.show_progress()
    if isinstance(data, pd.DataFrame):
-        data = pa.Table.from_pandas(data)
+        data = pa.Table.from_pandas(data, preserve_index=False)
    embeddings = func(data[column].to_numpy())
    table = vec_to_table(np.array(embeddings))
    return data.append_column("vector", table["vector"])
@@ -52,23 +52,38 @@ class EmbeddingFunction:

    def __call__(self, text):
        # Get the embedding with retry
-        @retry(**self.retry_kwargs)
-        def embed_func(c):
-            return self.func(c.tolist())
+        if len(self.retry_kwargs) > 0:

-        max_calls = self.rate_limiter_kwargs["max_calls"]
-        limiter = ratelimiter.RateLimiter(
-            max_calls, period=self.rate_limiter_kwargs["period"]
-        )
-        rate_limited = limiter(embed_func)
+            @retry(**self.retry_kwargs)
+            def embed_func(c):
+                return self.func(c.tolist())
+
+        else:
+
+            def embed_func(c):
+                return self.func(c.tolist())
+
+        if len(self.rate_limiter_kwargs) > 0:
+            import ratelimiter
+
+            max_calls = self.rate_limiter_kwargs["max_calls"]
+            limiter = ratelimiter.RateLimiter(
+                max_calls, period=self.rate_limiter_kwargs["period"]
+            )
+            embed_func = limiter(embed_func)
        batches = self.to_batches(text)
-        embeds = [emb for c in batches for emb in rate_limited(c)]
+        embeds = [emb for c in batches for emb in embed_func(c)]
        return embeds

    def __repr__(self):
        return f"EmbeddingFunction(func={self.func})"

    def rate_limit(self, max_calls=0.9, period=1.0):
+        import sys
+
+        v = int(sys.version_info.minor)
+        if v >= 11:
+            raise ValueError("rate limit only support up to 3.10")
        self.rate_limiter_kwargs = dict(max_calls=max_calls, period=period)
        return self

@@ -102,4 +117,4 @@ class EmbeddingFunction:

            yield from tqdm(_chunker(arr), total=math.ceil(length / self._batch_size))
        else:
-            return _chunker(arr)
+            yield from _chunker(arr)
--- a/python/lancedb/table.py
+++ b/python/lancedb/table.py
@@ -55,6 +55,27 @@ class LanceTable:
        """Return the schema of the table."""
        return self._dataset.schema

+    def __len__(self):
+        return self._dataset.count_rows()
+
+    def __repr__(self) -> str:
+        return f"LanceTable({self.name})"
+
+    def __str__(self) -> str:
+        return self.__repr__()
+
+    def head(self, n=5) -> pa.Table:
+        """Return the first n rows of the table."""
+        return self._dataset.head(n)
+
+    def to_pandas(self) -> pd.DataFrame:
+        """Return the table as a pandas DataFrame."""
+        return self.to_arrow().to_pandas()
+
+    def to_arrow(self) -> pa.Table:
+        """Return the table as a pyarrow Table."""
+        return self._dataset.to_table()
+
    @property
    def _dataset_uri(self) -> str:
        return os.path.join(self._conn.uri, f"{self.name}.lance")
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "lancedb"
-version = "0.0.2"
+version = "0.0.3"
 dependencies = ["pylance", "ratelimiter", "retry", "tqdm"]
 description = "lancedb"
 authors = [
--- a/python/tests/test_embeddings.py
+++ b/python/tests/test_embeddings.py
@@ -0,0 +1,42 @@
+#  Copyright 2023 LanceDB Developers
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+import sys
+
+import numpy as np
+import pyarrow as pa
+
+from lancedb.embeddings import with_embeddings
+
+
+def mock_embed_func(input_data):
+    return [np.random.randn(128).tolist() for _ in range(len(input_data))]
+
+
+def test_with_embeddings():
+    for wrap_api in [True, False]:
+        if wrap_api and sys.version_info.minor >= 11:
+            # ratelimiter package doesn't work on 3.11
+            continue
+        data = pa.Table.from_arrays(
+            [
+                pa.array(["foo", "bar"]),
+                pa.array([10.0, 20.0]),
+            ],
+            names=["text", "price"],
+        )
+        data = with_embeddings(mock_embed_func, data, wrap_api=wrap_api)
+        assert data.num_columns == 3
+        assert data.num_rows == 2
+        assert data.column_names == ["text", "price", "vector"]
+        assert data.column("text").to_pylist() == ["foo", "bar"]
+        assert data.column("price").to_pylist() == [10.0, 20.0]
Author	SHA1	Message	Date
Chang She	c32b6880e7	bump version for v0.0.3	2023-03-30 19:19:29 -07:00
Chang She	c6fe5e38f1	Merge pull request #16 from lancedb/jaichopra/readme-lance-benchmarking add Lance benchmarking blog	2023-03-30 19:04:44 -07:00
Jai	1c8b52f07b	add Lance benchmarking blog	2023-03-29 11:44:36 -07:00
Chang She	f544c5dd31	Merge pull request #14 from lancedb/changhiskhan/updates update for release	2023-03-24 19:50:35 -07:00
Chang She	eba533da4f	fix 3.11	2023-03-24 19:45:46 -07:00
Chang She	404211d4fb	fix 3.11	2023-03-24 19:00:22 -07:00
Chang She	5d7832c8a5	update for release	2023-03-24 18:16:29 -07:00
Jai	4eba83fdc9	Merge pull request #13 from lancedb/jaichopra/add-tutorials-readme Update README.md	2023-03-24 12:17:32 -07:00
Jai	6906a5f912	Update README.md	2023-03-24 12:16:44 -07:00
Jai	69fd80e9f2	Update README.md	2023-03-24 12:09:16 -07:00