From fd2fd948624aaef953d3bae8b93070439b115492 Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Fri, 26 Jan 2024 16:19:43 -0800 Subject: [PATCH] doc: update quick start for full rust example (#872) --- docs/mkdocs.yml | 1 + docs/src/basic.md | 266 +++++++++++++++++++++++++++++++++++++--------- 2 files changed, 216 insertions(+), 51 deletions(-) diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index fc977983..65aba135 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -129,6 +129,7 @@ nav: - ⚙️ API reference: - 🐍 Python: python/python.md - 👾 JavaScript: javascript/modules.md + - 🦀 Rust: https://docs.rs/vectordb/latest/vectordb/ - ☁️ LanceDB Cloud: - Overview: cloud/index.md - API reference: diff --git a/docs/src/basic.md b/docs/src/basic.md index bd30ed99..44ac128f 100644 --- a/docs/src/basic.md +++ b/docs/src/basic.md @@ -11,43 +11,77 @@ ## Installation === "Python" + ```shell pip install lancedb ``` === "Javascript" + ```shell npm install vectordb ``` +=== "Rust" + + ```shell + cargo install vectordb + ``` + + !!! info "Rust crate is installed as source. You need to install protobuf." + + === "macOS" + + ```shell + brew install protobuf + ``` + + === "Ubuntu/Debian" + + ```shell + sudo apt install -y protobuf-compiler libssl-dev + ``` + + ## How to connect to a database === "Python" + ```python import lancedb uri = "data/sample-lancedb" db = lancedb.connect(uri) ``` - LanceDB will create the directory if it doesn't exist (including parent directories). - - If you need a reminder of the uri, use the `db.uri` property. - === "Javascript" + ```javascript const lancedb = require("vectordb"); const uri = "data/sample-lancedb"; const db = await lancedb.connect(uri); ``` - - LanceDB will create the directory if it doesn't exist (including parent directories). - If you need a reminder of the uri, you can call `db.uri()`. 
+=== "Rust" + + ```rust + use vectordb::connect; + + #[tokio::main] + async fn main() -> Result<()> { + let uri = "data/sample-lancedb"; + let db = connect(&uri).await?; + } + ``` + +LanceDB will create the directory if it doesn't exist (including parent directories). + +If you need a reminder of the uri, you can call `db.uri()`. ## How to create a table === "Python" + ```python tbl = db.create_table("my_table", data=[{"vector": [3.1, 4.1], "item": "foo", "price": 10.0}, @@ -59,6 +93,7 @@ to the `create_table` method. You can also pass in a pandas DataFrame directly: + ```python import pandas as pd df = pd.DataFrame([{"vector": [3.1, 4.1], "item": "foo", "price": 10.0}, @@ -67,8 +102,9 @@ ``` === "Javascript" + ```javascript - const tb = await db.createTable( + const tbl = await db.createTable( "myTable", [{"vector": [3.1, 4.1], "item": "foo", "price": 10.0}, {"vector": [5.9, 26.5], "item": "bar", "price": 20.0}] @@ -79,6 +115,31 @@ If you want to overwrite the table, you can pass in `mode="overwrite"` to the `createTable` function. +=== "Rust" + + ```rust + use arrow_schema::{DataType, Schema, Field}; + use arrow_array::{RecordBatch, RecordBatchIterator}; + + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("vector", DataType::FixedSizeList( + Arc::new(Field::new("item", DataType::Float32, true)), 128), true), + ])); + // Create a RecordBatch stream. + let batches = RecordBatchIterator::new(vec![ + RecordBatch::try_new(schema.clone(), + vec![ + Arc::new(Int32Array::from_iter_values(0..10)), + Arc::new(FixedSizeListArray::from_iter_primitive::<Float32Type, _, _>( + (0..10).map(|_| Some(vec![Some(1.0); 128])), 128)), + ]).unwrap() + ].into_iter().map(Ok), + schema.clone()); + db.create_table("my_table", Box::new(batches), None).await.unwrap(); + ``` + + If the table already exists, LanceDB will raise an error by default. !!! 
info "Under the hood, LanceDB is converting the input data into an Apache Arrow table and persisting it to disk in [Lance format](https://www.github.com/lancedb/lance)." @@ -88,76 +149,164 @@ Sometimes you may not have the data to insert into the table at creation time. In this case, you can create an empty table and specify the schema. === "Python" + ```python import pyarrow as pa schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), list_size=2))]) tbl = db.create_table("empty_table", schema=schema) ``` +=== "Javascript" + + ```typescript + import { Schema, Field, FixedSizeList, DataType } from "apache-arrow"; + + schema = new Schema([new Field("vec", new FixedSizeList(2, new Field("item", new Float32())))]) + tbl = await db.createTable({ name: "empty_table", schema: schema }); + ``` + +=== "Rust" + + ```rust + use arrow_schema::{Schema, Field, DataType}; + use arrow_array::{RecordBatch, RecordBatchIterator}; + + let schema = Arc::new(Schema::new(vec![ + Field::new("vector", DataType::FixedSizeList( + Arc::new(Field::new("item", DataType::Float32, true)), 2), true), + ])); + let batches = RecordBatchIterator::new(vec![].into_iter().map(Ok), schema); + db.create_table("empty_table", Box::new(batches), None).await.unwrap(); + ``` + ## How to open an existing table Once created, you can open a table using the following code: === "Python" - ```python - tbl = db.open_table("my_table") - ``` - If you forget the name of your table, you can always get a listing of all table names: - - ```python - print(db.table_names()) - ``` + ```python + tbl = db.open_table("my_table") + ``` === "Javascript" - ```javascript - const tbl = await db.openTable("myTable"); - ``` - If you forget the name of your table, you can always get a listing of all table names: + ```typescript + const tbl = await db.openTable("myTable"); + ``` - ```javascript - console.log(await db.tableNames()); - ``` +=== "Rust" + + ```rust + let tbl = db.open_table_with_params("myTable", 
None).await.unwrap(); + ``` + +If you forget the name of your table, you can always get a listing of all table names: + +=== "Python" + + ```python + print(db.table_names()) + ``` + +=== "Javascript" + + ```javascript + console.log(await db.tableNames()); + ``` + +=== "Rust" + + ```rust + println!("{:?}", db.table_names().await.unwrap()); + ``` ## How to add data to a table After a table has been created, you can always add more data to it using === "Python" - ```python - # Option 1: Add a list of dicts to a table - data = [{"vector": [1.3, 1.4], "item": "fizz", "price": 100.0}, - {"vector": [9.5, 56.2], "item": "buzz", "price": 200.0}] - tbl.add(data) + ```python - # Option 2: Add a pandas DataFrame to a table - df = pd.DataFrame(data) - tbl.add(data) - ``` + # Option 1: Add a list of dicts to a table + data = [{"vector": [1.3, 1.4], "item": "fizz", "price": 100.0}, + {"vector": [9.5, 56.2], "item": "buzz", "price": 200.0}] + tbl.add(data) + + # Option 2: Add a pandas DataFrame to a table + df = pd.DataFrame(data) + tbl.add(data) + ``` === "Javascript" - ```javascript - await tbl.add([{vector: [1.3, 1.4], item: "fizz", price: 100.0}, - {vector: [9.5, 56.2], item: "buzz", price: 200.0}]) - ``` + + ```javascript + await tbl.add([{vector: [1.3, 1.4], item: "fizz", price: 100.0}, + {vector: [9.5, 56.2], item: "buzz", price: 200.0}]) + ``` + +=== "Rust" + + ```rust + let batches = RecordBatchIterator::new(...); + tbl.add(Box::new(batches), None).await.unwrap(); + ``` ## How to search for (approximate) nearest neighbors Once you've embedded the query, you can find its nearest neighbors using the following code: === "Python" - ```python - tbl.search([100, 100]).limit(2).to_pandas() - ``` - This returns a pandas DataFrame with the results. + ```python + tbl.search([100, 100]).limit(2).to_pandas() + ``` + + This returns a pandas DataFrame with the results. 
=== "Javascript" - ```javascript - const query = await tbl.search([100, 100]).limit(2).execute(); - ``` + + ```javascript + const query = await tbl.search([100, 100]).limit(2).execute(); + ``` + +=== "Rust" + + ```rust + use arrow_array::RecordBatch; + use futures::TryStreamExt; + + let results: Vec<RecordBatch> = tbl + .search(&[100.0, 100.0]) + .execute_stream() + .await + .unwrap() + .try_collect(); + ``` + +By default, LanceDB runs a brute-force scan over the dataset to find the K nearest neighbours (KNN). +Users can speed up the query by creating vector indices over the vector columns. + +=== "Python" + + ```python + tbl.create_index() + ``` + +=== "Javascript" + + ```javascript + await tbl.createIndex({}) + ``` + +=== "Rust" + + ```rust + tbl.create_index(&["vector"]).build().await.unwrap() + ``` + +Check the [Approximate Nearest Neighbor (ANN) Indexes](/ann_indices.md) section for more details. ## How to delete rows from a table @@ -166,20 +315,27 @@ which rows to delete, provide a filter that matches on the metadata columns. This can delete any number of rows that match the filter. === "Python" - ```python - tbl.delete('item = "fizz"') - ``` + + ```python + tbl.delete('item = "fizz"') + ``` === "Javascript" - ```javascript - await tbl.delete('item = "fizz"') - ``` + + ```javascript + await tbl.delete('item = "fizz"') + ``` + +=== "Rust" + + ```rust + tbl.delete("item = \"fizz\"").await.unwrap(); + ``` The deletion predicate is a SQL expression that supports the same expressions as the `where()` clause on a search. They can be as simple or complex as needed. To see what expressions are supported, see the [SQL filters](sql.md) section. - === "Python" Read more: [lancedb.table.Table.delete][] @@ -193,6 +349,7 @@ To see what expressions are supported, see the [SQL filters](sql.md) section. Use the `drop_table()` method on the database to remove a table. 
=== "Python" + ```python db.drop_table("my_table") ``` @@ -202,12 +359,19 @@ Use the `drop_table()` method on the database to remove a table. you can pass in `ignore_missing=True`. === "JavaScript" + ```javascript await db.dropTable('myTable') ``` This permanently removes the table and is not recoverable, unlike deleting rows. - If the table does not exist an exception is raised. + If the table does not exist an exception is raised. + +=== "Rust" + + ```rust + db.drop_table("my_table").await.unwrap() + ``` !!! note "Bundling `vectordb` apps with Webpack"