diff --git a/docs/src/basic_legacy.ts b/docs/src/basic_legacy.ts index 7f6f8984..560f7254 100644 --- a/docs/src/basic_legacy.ts +++ b/docs/src/basic_legacy.ts @@ -1,6 +1,6 @@ // --8<-- [start:import] import * as lancedb from "vectordb"; -import { Schema, Field, Float32, FixedSizeList, Int32 } from "apache-arrow"; +import { Schema, Field, Float32, FixedSizeList, Int32, Float16 } from "apache-arrow"; // --8<-- [end:import] import * as fs from "fs"; import { Table as ArrowTable, Utf8 } from "apache-arrow"; @@ -8,6 +8,7 @@ import { Table as ArrowTable, Utf8 } from "apache-arrow"; const example = async () => { fs.rmSync("data/sample-lancedb", { recursive: true, force: true }); // --8<-- [start:open_db] + const lancedb = require("vectordb"); const uri = "data/sample-lancedb"; const db = await lancedb.connect(uri); // --8<-- [end:open_db] @@ -48,6 +49,27 @@ const example = async () => { const empty_tbl = await db.createTable({ name: "empty_table", schema }); // --8<-- [end:create_empty_table] + // --8<-- [start:create_f16_table] + const dim = 16 + const total = 10 + const f16_schema = new Schema([ + new Field('id', new Int32()), + new Field( + 'vector', + new FixedSizeList(dim, new Field('item', new Float16(), true)), + false + ) + ]) + const data = lancedb.makeArrowTable( + Array.from(Array(total), (_, i) => ({ + id: i, + vector: Array.from(Array(dim), Math.random) + })), + { schema: f16_schema } + ) + const table = await db.createTable('f16_tbl', data) + // --8<-- [end:create_f16_table] + // --8<-- [start:search] const query = await tbl.search([100, 100]).limit(2).execute(); // --8<-- [end:search] diff --git a/docs/src/guides/tables.md b/docs/src/guides/tables.md index ede6aee5..12510941 100644 --- a/docs/src/guides/tables.md +++ b/docs/src/guides/tables.md @@ -16,9 +16,22 @@ This guide will show how to create tables, insert data into them, and update the db = lancedb.connect("./.lancedb") ``` +=== "Javascript" + + Initialize a VectorDB connection and create a table using one 
of the many methods listed below. + + ```javascript + const lancedb = require("vectordb"); + + const uri = "data/sample-lancedb"; + const db = await lancedb.connect(uri); + ``` + LanceDB allows ingesting data from various sources - `dict`, `list[dict]`, `pd.DataFrame`, `pa.Table` or a `Iterator[pa.RecordBatch]`. Let's take a look at some of the these. - ### From list of tuples or dictionaries +### From list of tuples or dictionaries + +=== "Python" ```python import lancedb @@ -32,7 +45,6 @@ This guide will show how to create tables, insert data into them, and update the db["my_table"].head() ``` - !!! info "Note" If the table already exists, LanceDB will raise an error by default. @@ -51,7 +63,28 @@ This guide will show how to create tables, insert data into them, and update the db.create_table("name", data, mode="overwrite") ``` - ### From a Pandas DataFrame +=== "Javascript" + You can create a LanceDB table in JavaScript using an array of JSON records as follows. + + ```javascript + const tb = await db.createTable("my_table", [{ + "vector": [3.1, 4.1], + "item": "foo", + "price": 10.0 + }, { + "vector": [5.9, 26.5], + "item": "bar", + "price": 20.0 + }]); + ``` + !!! info "Note" + If the table already exists, LanceDB will raise an error by default. If you want to overwrite the table, you need to specify the `WriteMode` in the createTable function. + + ```javascript + const table = await db.createTable(tableName, data, { writeMode: WriteMode.Overwrite }) + ``` + +### From a Pandas DataFrame ```python import pandas as pd @@ -79,7 +112,7 @@ This guide will show how to create tables, insert data into them, and update the table = db.create_table("my_table", data, schema=custom_schema) ``` - ### From a Polars DataFrame +### From a Polars DataFrame LanceDB supports [Polars](https://pola.rs/), a modern, fast DataFrame library written in Rust. 
Just like in Pandas, the Polars integration is enabled by PyArrow @@ -97,26 +130,44 @@ This guide will show how to create tables, insert data into them, and update the table = db.create_table("pl_table", data=data) ``` - ### From PyArrow Tables - You can also create LanceDB tables directly from PyArrow tables +### From an Arrow Table +=== "Python" + You can also create LanceDB tables directly from Arrow tables. + LanceDB supports float16 data type! ```python - table = pa.Table.from_arrays( - [ - pa.array([[3.1, 4.1, 5.1, 6.1], [5.9, 26.5, 4.7, 32.8]], - pa.list_(pa.float32(), 4)), - pa.array(["foo", "bar"]), - pa.array([10.0, 20.0]), - ], - ["vector", "item", "price"], - ) + import pyarrow as pa + import numpy as np + + dim = 16 + total = 2 + schema = pa.schema( + [ + pa.field("vector", pa.list_(pa.float16(), dim)), + pa.field("text", pa.string()) + ] + ) + data = pa.Table.from_arrays( + [ + pa.array([np.random.randn(dim).astype(np.float16) for _ in range(total)], + pa.list_(pa.float16(), dim)), + pa.array(["foo", "bar"]) + ], + ["vector", "text"], + ) + tbl = db.create_table("f16_tbl", data, schema=schema) + ``` - db = lancedb.connect("db") +=== "Javascript" + You can also create LanceDB tables directly from Arrow tables. + LanceDB supports Float16 data type! - tbl = db.create_table("my_table", table) + ```javascript + --8<-- "docs/src/basic_legacy.ts:create_f16_table" ``` ### From Pydantic Models + When you create an empty table without data, you must specify the table schema. LanceDB supports creating tables by specifying a PyArrow schema or a specialized Pydantic model called `LanceModel`. @@ -261,37 +312,6 @@ This guide will show how to create tables, insert data into them, and update the You can also use iterators of other types like Pandas DataFrame or Pylists directly in the above example. -=== "JavaScript" - Initialize a VectorDB connection and create a table using one of the many methods listed below. 
- - ```javascript - const lancedb = require("vectordb"); - - const uri = "data/sample-lancedb"; - const db = await lancedb.connect(uri); - ``` - - You can create a LanceDB table in JavaScript using an array of JSON records as follows. - - ```javascript - const tb = await db.createTable("my_table", [{ - "vector": [3.1, 4.1], - "item": "foo", - "price": 10.0 - }, { - "vector": [5.9, 26.5], - "item": "bar", - "price": 20.0 - }]); - ``` - - !!! info "Note" - If the table already exists, LanceDB will raise an error by default. If you want to overwrite the table, you need to specify the `WriteMode` in the createTable function. - - ```javascript - const table = await con.createTable(tableName, data, { writeMode: WriteMode.Overwrite }) - ``` - ## Open existing tables === "Python" diff --git a/docs/src/notebooks/tables_guide.ipynb b/docs/src/notebooks/tables_guide.ipynb index d7c46043..4f3bbec1 100644 --- a/docs/src/notebooks/tables_guide.ipynb +++ b/docs/src/notebooks/tables_guide.ipynb @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 2, "id": "c1b4e34b-a49c-471d-a343-a5940bb5138a", "metadata": {}, "outputs": [], @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "id": "4e5a8d07-d9a1-48c1-913a-8e0629289579", "metadata": {}, "outputs": [], @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "id": "5df12f66-8d99-43ad-8d0b-22189ec0a6b9", "metadata": {}, "outputs": [ @@ -62,7 +62,7 @@ "long: [[-122.7,-74.1]]" ] }, - "execution_count": 2, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -90,7 +90,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "id": "f4d87ae9-0ccb-48eb-b31d-bb8f2370e47e", "metadata": {}, "outputs": [ @@ -108,7 +108,7 @@ "long: [[-122.7,-74.1]]" ] }, - "execution_count": 3, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -135,10 +135,17 @@ }, { 
"cell_type": "code", - "execution_count": 8, + "execution_count": 6, "id": "25f34bcf-fca0-4431-8601-eac95d1bd347", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2024-01-31T18:59:33Z WARN lance::dataset] No existing dataset at /Users/qian/Work/LanceDB/lancedb/docs/src/notebooks/.lancedb/table3.lance, it will be created\n" + ] + }, { "data": { "text/plain": [ @@ -148,7 +155,7 @@ "long: float" ] }, - "execution_count": 8, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -171,45 +178,51 @@ "id": "4df51925-7ca2-4005-9c72-38b3d26240c6", "metadata": {}, "source": [ - "### From PyArrow Tables\n", + "### From an Arrow Table\n", "\n", - "You can also create LanceDB tables directly from pyarrow tables" + "You can also create LanceDB tables directly from pyarrow tables. LanceDB supports float16 type." ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 7, "id": "90a880f6-be43-4c9d-ba65-0b05197c0f6f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "vector: fixed_size_list[2]\n", - " child 0, item: float\n", - "item: string\n", - "price: double" + "vector: fixed_size_list[16]\n", + " child 0, item: halffloat\n", + "text: string" ] }, - "execution_count": 12, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "table = pa.Table.from_arrays(\n", - " [\n", - " pa.array([[3.1, 4.1], [5.9, 26.5]],\n", - " pa.list_(pa.float32(), 2)),\n", - " pa.array([\"foo\", \"bar\"]),\n", - " pa.array([10.0, 20.0]),\n", - " ],\n", - " [\"vector\", \"item\", \"price\"],\n", - " )\n", + "import numpy as np\n", "\n", - "db = lancedb.connect(\"db\")\n", + "dim = 16\n", + "total = 2\n", + "schema = pa.schema(\n", + " [\n", + " pa.field(\"vector\", pa.list_(pa.float16(), dim)),\n", + " pa.field(\"text\", pa.string())\n", + " ]\n", + ")\n", + "data = pa.Table.from_arrays(\n", + " [\n", + " pa.array([np.random.randn(dim).astype(np.float16) for _ in 
range(total)],\n", + " pa.list_(pa.float16(), dim)),\n", + " pa.array([\"foo\", \"bar\"])\n", + " ],\n", + " [\"vector\", \"text\"],\n", + ")\n", "\n", - "tbl = db.create_table(\"test1\", table, mode=\"overwrite\")\n", + "tbl = db.create_table(\"f16_tbl\", data, schema=schema)\n", "tbl.schema" ] }, @@ -225,7 +238,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 8, "id": "d81121d7-e4b7-447c-a48c-974b6ebb464a", "metadata": {}, "outputs": [ @@ -240,7 +253,7 @@ "imdb_id: int64 not null" ] }, - "execution_count": 13, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -282,7 +295,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 9, "id": "bc247142-4e3c-41a2-b94c-8e00d2c2a508", "metadata": {}, "outputs": [ @@ -292,7 +305,7 @@ "LanceTable(table4)" ] }, - "execution_count": 14, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -333,7 +346,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 10, "id": "25ad3523-e0c9-4c28-b3df-38189c4e0e5f", "metadata": {}, "outputs": [ @@ -346,7 +359,7 @@ "price: double not null" ] }, - "execution_count": 16, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -385,7 +398,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 11, "id": "2814173a-eacc-4dd8-a64d-6312b44582cc", "metadata": {}, "outputs": [], @@ -411,7 +424,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 12, "id": "df9e13c0-41f6-437f-9dfa-2fd71d3d9c45", "metadata": {}, "outputs": [ @@ -421,7 +434,7 @@ "['table6', 'table4', 'table5', 'movielens_small']" ] }, - "execution_count": 18, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -432,7 +445,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 13, "id": "9343f5ad-6024-42ee-ac2f-6c1471df8679", "metadata": {}, "outputs": [ @@ -541,7 +554,7 @@ "9 [5.9, 26.5] bar 20.0" ] }, - 
"execution_count": 20, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -564,7 +577,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 14, "id": "8a56250f-73a1-4c26-a6ad-5c7a0ce3a9ab", "metadata": {}, "outputs": [], @@ -590,7 +603,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 15, "id": "030c7057-b98e-4e2f-be14-b8c1f927f83c", "metadata": {}, "outputs": [], @@ -621,7 +634,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 16, "id": "e7a17de2-08d2-41b7-bd05-f63d1045ab1f", "metadata": {}, "outputs": [ @@ -629,16 +642,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "32\n" + "22\n" ] }, { "data": { "text/plain": [ - "17" + "12" ] }, - "execution_count": 24, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -661,7 +674,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 17, "id": "fe3310bd-08f4-4a22-a63b-b3127d22f9f7", "metadata": {}, "outputs": [ @@ -681,25 +694,20 @@ "8 [3.1, 4.1] foo 10.0\n", "9 [3.1, 4.1] foo 10.0\n", "10 [3.1, 4.1] foo 10.0\n", - "11 [3.1, 4.1] foo 10.0\n", - "12 [3.1, 4.1] foo 10.0\n", - "13 [3.1, 4.1] foo 10.0\n", - "14 [3.1, 4.1] foo 10.0\n", - "15 [3.1, 4.1] foo 10.0\n", - "16 [3.1, 4.1] foo 10.0\n" + "11 [3.1, 4.1] foo 10.0\n" ] }, { "ename": "OSError", - "evalue": "LanceError(IO): Error during planning: column foo does not exist", + "evalue": "LanceError(IO): Error during planning: column foo does not exist, /Users/runner/work/lance/lance/rust/lance-core/src/error.rs:212:23", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[30], line 4\u001b[0m\n\u001b[1;32m 2\u001b[0m to_remove \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m, 
\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(\u001b[38;5;28mstr\u001b[39m(v) \u001b[38;5;28;01mfor\u001b[39;00m v \u001b[38;5;129;01min\u001b[39;00m to_remove)\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28mprint\u001b[39m(tbl\u001b[38;5;241m.\u001b[39mto_pandas())\n\u001b[0;32m----> 4\u001b[0m \u001b[43mtbl\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdelete\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mitem IN (\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mto_remove\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m)\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 5\u001b[0m tbl\u001b[38;5;241m.\u001b[39mto_pandas()\n", - "File \u001b[0;32m~/Documents/lancedb/lancedb/python/lancedb/table.py:610\u001b[0m, in \u001b[0;36mLanceTable.delete\u001b[0;34m(self, where)\u001b[0m\n\u001b[1;32m 609\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdelete\u001b[39m(\u001b[38;5;28mself\u001b[39m, where: \u001b[38;5;28mstr\u001b[39m):\n\u001b[0;32m--> 610\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_dataset\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdelete\u001b[49m\u001b[43m(\u001b[49m\u001b[43mwhere\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/Documents/lancedb/lancedb/env/lib/python3.11/site-packages/lance/dataset.py:489\u001b[0m, in \u001b[0;36mLanceDataset.delete\u001b[0;34m(self, predicate)\u001b[0m\n\u001b[1;32m 487\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(predicate, pa\u001b[38;5;241m.\u001b[39mcompute\u001b[38;5;241m.\u001b[39mExpression):\n\u001b[1;32m 488\u001b[0m predicate \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(predicate)\n\u001b[0;32m--> 489\u001b[0m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_ds\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdelete\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpredicate\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[0;31mOSError\u001b[0m: LanceError(IO): Error during planning: column foo does not exist" + "Cell \u001b[0;32mIn[17], line 4\u001b[0m\n\u001b[1;32m 2\u001b[0m to_remove \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(\u001b[38;5;28mstr\u001b[39m(v) \u001b[38;5;28;01mfor\u001b[39;00m v \u001b[38;5;129;01min\u001b[39;00m to_remove)\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28mprint\u001b[39m(tbl\u001b[38;5;241m.\u001b[39mto_pandas())\n\u001b[0;32m----> 4\u001b[0m \u001b[43mtbl\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdelete\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mitem IN (\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mto_remove\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m)\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Work/LanceDB/lancedb/docs/doc-venv/lib/python3.11/site-packages/lancedb/table.py:872\u001b[0m, in \u001b[0;36mLanceTable.delete\u001b[0;34m(self, where)\u001b[0m\n\u001b[1;32m 871\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdelete\u001b[39m(\u001b[38;5;28mself\u001b[39m, where: \u001b[38;5;28mstr\u001b[39m):\n\u001b[0;32m--> 872\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_dataset\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdelete\u001b[49m\u001b[43m(\u001b[49m\u001b[43mwhere\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Work/LanceDB/lancedb/docs/doc-venv/lib/python3.11/site-packages/lance/dataset.py:596\u001b[0m, in \u001b[0;36mLanceDataset.delete\u001b[0;34m(self, 
predicate)\u001b[0m\n\u001b[1;32m 594\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(predicate, pa\u001b[38;5;241m.\u001b[39mcompute\u001b[38;5;241m.\u001b[39mExpression):\n\u001b[1;32m 595\u001b[0m predicate \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(predicate)\n\u001b[0;32m--> 596\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_ds\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdelete\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpredicate\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[0;31mOSError\u001b[0m: LanceError(IO): Error during planning: column foo does not exist, /Users/runner/work/lance/lance/rust/lance-core/src/error.rs:212:23" ] } ], @@ -712,7 +720,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": null, "id": "87d5bc21-847f-4c81-b56e-f6dbe5d05aac", "metadata": {}, "outputs": [], @@ -729,7 +737,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": null, "id": "9cba4519-eb3a-4941-ab7e-873d762e750f", "metadata": {}, "outputs": [], @@ -742,7 +750,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": null, "id": "5bdc9801-d5ed-4871-92d0-88b27108e788", "metadata": {}, "outputs": [ @@ -817,7 +825,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.11.7" } }, "nbformat": 4,