From b9c532326538a032ed3197f76ba1764bf2c68751 Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Sun, 28 Jan 2024 14:30:30 -0800 Subject: [PATCH] doc: use snippet for rust code example and make sure rust examples run through CI (#885) --- .github/workflows/rust.yml | 2 + docs/mkdocs.yml | 1 + docs/src/ann_indexes.md | 28 ++++------ docs/src/basic.md | 96 +++++++++++--------------------- docs/src/basic.rs | 1 - docs/src/search.md | 6 +- docs/src/sql.md | 6 +- rust/vectordb/examples/simple.rs | 92 ++++++++++++++++++++++++++++-- 8 files changed, 141 insertions(+), 91 deletions(-) delete mode 120000 docs/src/basic.rs diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 5ae150b3..21db91c6 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -70,6 +70,8 @@ jobs: run: cargo build --all-features - name: Run tests run: cargo test --all-features + - name: Run examples + run: cargo run --example simple macos: timeout-minutes: 30 strategy: diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 32b2408e..926ca9d1 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -68,6 +68,7 @@ markdown_extensions: pygments_lang_class: true - pymdownx.inlinehilite - pymdownx.snippets: + base_path: .. dedent_subsections: true - pymdownx.superfences - pymdownx.tabbed: diff --git a/docs/src/ann_indexes.md b/docs/src/ann_indexes.md index bc745b75..5460e012 100644 --- a/docs/src/ann_indexes.md +++ b/docs/src/ann_indexes.md @@ -47,12 +47,12 @@ Lance supports `IVF_PQ` index type by default. tbl.create_index(num_partitions=256, num_sub_vectors=96) ``` -=== "Javascript" +=== "Typescript" - ```javascript - --8<--- "src/ann_indexes.ts:import" + ```typescript + --8<--- "docs/src/ann_indexes.ts:import" - --8<-- "src/ann_indexes.ts:ingest" + --8<-- "docs/src/ann_indexes.ts:ingest" ``` - **metric** (default: "L2"): The distance metric to use. By default it uses euclidean distance "`L2`". @@ -137,10 +137,10 @@ There are a couple of parameters that can be used to fine-tune the search: 1 [0.48587373, 0.269207, 0.15095535, 0.65531915,... item 3953 108.393867 ``` -=== "Javascript" +=== "Typescript" - ```javascript - --8<-- "src/ann_indexes.ts:search1" + ```typescript + --8<-- "docs/src/ann_indexes.ts:search1" ``` The search will return the data requested in addition to the distance of each item. @@ -155,10 +155,10 @@ You can further filter the elements returned by a search using a where clause. tbl.search(np.random.random((1536))).where("item != 'item 1141'").to_pandas() ``` -=== "Javascript" +=== "Typescript" ```javascript - --8<-- "src/ann_indexes.ts:search2" + --8<-- "docs/src/ann_indexes.ts:search2" ``` ### Projections (select clause) @@ -179,10 +179,10 @@ You can select the columns returned by the query using a select clause. ... ``` -=== "Javascript" +=== "Typescript" - ```javascript - --8<-- "src/ann_indexes.ts:search3" + ```typescript + --8<-- "docs/src/ann_indexes.ts:search3" ``` ## FAQ @@ -212,7 +212,3 @@ On `SIFT-1M` dataset, our benchmark shows that keeping each partition 1K-4K rows PQ is a lossy compression of the original vector, a higher `num_sub_vectors` usually results in less space distortion, and thus yields better accuracy. However, a higher `num_sub_vectors` also causes heavier I/O and more PQ computation, and thus, higher latency. `dimension / num_sub_vectors` should be a multiple of 8 for optimum SIMD efficiency. - -``` - -``` diff --git a/docs/src/basic.md b/docs/src/basic.md index 5cca4f7e..e8032a21 100644 --- a/docs/src/basic.md +++ b/docs/src/basic.md @@ -16,7 +16,7 @@ pip install lancedb ``` -=== "Javascript" +=== "Typescript" ```shell npm install vectordb @@ -24,6 +24,8 @@ === "Rust" + !!! warning "Rust SDK is experimental, might introduce breaking changes in the near future" + ```shell cargo add vectordb ``` @@ -54,12 +56,12 @@ db = lancedb.connect(uri) ``` -=== "Javascript" +=== "Typescript" ```typescript - --8<-- "src/basic_legacy.ts:import" + --8<-- "docs/src/basic_legacy.ts:import" - --8<-- "src/basic_legacy.ts:open_db" + --8<-- "docs/src/basic_legacy.ts:open_db" ``` === "Rust" @@ -67,7 +69,7 @@ ```rust #[tokio::main] async fn main() -> Result<()> { - --8<-- "src/basic.rs:connect" + --8<-- "rust/vectordb/examples/simple.rs:connect" } ``` @@ -100,10 +102,10 @@ If you need a reminder of the uri, you can call `db.uri()`. tbl = db.create_table("table_from_df", data=df) ``` -=== "Javascript" +=== "Typescript" - ```javascript - --8<-- "src/basic_legacy.ts:create_table" + ```typescript + --8<-- "docs/src/basic_legacy.ts:create_table" ``` If the table already exists, LanceDB will raise an error by default. @@ -116,22 +118,7 @@ If you need a reminder of the uri, you can call `db.uri()`. use arrow_schema::{DataType, Schema, Field}; use arrow_array::{RecordBatch, RecordBatchIterator}; - let schema = Arc::new(Schema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new("vector", DataType::FixedSizeList( - Arc::new(Field::new("item", DataType::Float32, true)), 128), true), - ])); - // Create a RecordBatch stream. - let batches = RecordBatchIterator::new(vec![ - RecordBatch::try*new(schema.clone(), - vec![ - Arc::new(Int32Array::from_iter_values(0..10)), - Arc::new(FixedSizeListArray::from_iter_primitive::( - (0..10).map(|_| Some(vec![Some(1.0); 128])), 128)), - ]).unwrap() - ].into_iter().map(Ok), - schema.clone()); - db.create_table("my_table", Box::new(batches), None).await.unwrap(); + --8<-- "rust/vectordb/examples/simple.rs:create_table" ``` If the table already exists, LanceDB will raise an error by default. @@ -151,24 +138,16 @@ In this case, you can create an empty table and specify the schema. tbl = db.create_table("empty_table", schema=schema) ``` -=== "Javascript" +=== "Typescript" ```typescript - --8<-- "src/basic_legacy.ts:create_empty_table" + --8<-- "docs/src/basic_legacy.ts:create_empty_table" ``` === "Rust" ```rust - use arrow_schema::{Schema, Field, DataType}; - use arrow_array::{RecordBatch, RecordBatchIterator}; - - let schema = Arc::new(Schema::new(vec![ - Field::new("vector", DataType::FixedSizeList( - Arc::new(Field::new("item", DataType::Float32, true)), 2), true), - ])); - let batches = RecordBatchIterator::new(vec![].into_iter().map(Ok), schema); - db.create_table("empty_table", Box::new(batches), None).await.unwrap(); + --8<-- "rust/vectordb/examples/simple.rs:create_empty_table" ``` ## How to open an existing table @@ -181,7 +160,7 @@ Once created, you can open a table using the following code: tbl = db.open_table("my_table") ``` -=== "Javascript" +=== "Typescript" ```typescript const tbl = await db.openTable("myTable"); @@ -190,7 +169,7 @@ Once created, you can open a table using the following code: === "Rust" ```rust - const tbl = db.open_table_with_params("myTable", None).await.unwrap(); + --8<-- "rust/vectordb/examples/simple.rs:open_with_existing_file" ``` If you forget the name of your table, you can always get a listing of all table names: @@ -210,7 +189,7 @@ If you forget the name of your table, you can always get a listing of all table === "Rust" ```rust - println!("{:?}", db.table_names().await.unwrap()); + --8<-- "rust/vectordb/examples/simple.rs:list_names" ``` ## How to add data to a table @@ -231,9 +210,9 @@ After a table has been created, you can always add more data to it using tbl.add(data) ``` -=== "Javascript" +=== "Typescript" - ```javascript + ```typescript await tbl.add([{vector: [1.3, 1.4], item: "fizz", price: 100.0}, {vector: [9.5, 56.2], item: "buzz", price: 200.0}]) ``` @@ -241,8 +220,7 @@ After a table has been created, you can always add more data to it using === "Rust" ```rust - let batches = RecordBatchIterator::new(...); - tbl.add(Box::new(batches), None).await.unwrap(); + --8<-- "rust/vectordb/examples/simple.rs:add" ``` ## How to search for (approximate) nearest neighbors @@ -257,24 +235,18 @@ Once you've embedded the query, you can find its nearest neighbors using the fol This returns a pandas DataFrame with the results. -=== "Javascript" +=== "Typescript" - ```javascript - --8<-- "src/basic_legacy.ts:search" + ```typescript + --8<-- "docs/src/basic_legacy.ts:search" ``` === "Rust" ```rust - use arrow_array::RecordBatch; use futures::TryStreamExt; - let results: Vec = tbl - .search(&[100.0, 100.0]) - .execute_stream() - .await - .unwrap() - .try_collect(); + --8<-- "rust/vectordb/examples/simple.rs:search" ``` By default, LanceDB runs a brute-force scan over dataset to find the K nearest neighbours (KNN). @@ -286,7 +258,7 @@ For tables with more than 50K vectors, creating an ANN index is recommended to s tbl.create_index() ``` -=== "Javascript" +=== "Typescript" ```{.typescript .ignore} await tbl.createIndex({}) @@ -295,7 +267,7 @@ For tables with more than 50K vectors, creating an ANN index is recommended to s === "Rust" ```rust - tbl.create_index(&["vector"]).build().await.unwrap() + --8<-- "rust/vectordb/examples/simple.rs:create_index" ``` Check [Approximate Nearest Neighbor (ANN) Indexes](/ann_indices.md) section for more details. @@ -312,16 +284,16 @@ This can delete any number of rows that match the filter. tbl.delete('item = "fizz"') ``` -=== "Javascript" +=== "Typescript" - ```javascript - --8<-- "src/basic_legacy.ts:delete" + ```typescript + --8<-- "docs/src/basic_legacy.ts:delete" ``` === "Rust" ```rust - tbl.delete("item = \"fizz\"").await.unwrap(); + --8<-- "rust/vectordb/examples/simple.rs:delete" ``` The deletion predicate is a SQL expression that supports the same expressions @@ -350,10 +322,10 @@ Use the `drop_table()` method on the database to remove a table. By default, if the table does not exist an exception is raised. To suppress this, you can pass in `ignore_missing=True`. -=== "JavaScript" +=== "Typescript" - ```javascript - --8<-- "src/basic_legacy.ts:drop_table" + ```typescript + --8<-- "docs/src/basic_legacy.ts:drop_table" ``` This permanently removes the table and is not recoverable, unlike deleting rows. @@ -362,7 +334,7 @@ Use the `drop_table()` method on the database to remove a table. === "Rust" ```rust - db.drop_table("my_table").await.unwrap() + --8<-- "rust/vectordb/examples/simple.rs:drop_table" ``` !!! note "Bundling `vectordb` apps with Webpack" diff --git a/docs/src/basic.rs b/docs/src/basic.rs deleted file mode 120000 index 8899e302..00000000 --- a/docs/src/basic.rs +++ /dev/null @@ -1 +0,0 @@ -../../rust/vectordb/examples/simple.rs \ No newline at end of file diff --git a/docs/src/search.md b/docs/src/search.md index ac4613f1..911fc668 100644 --- a/docs/src/search.md +++ b/docs/src/search.md @@ -56,9 +56,9 @@ db.create_table("my_vectors", data=data) === "JavaScript" ```javascript - --8<-- "src/search_legacy.ts:import" + --8<-- "docs/src/search_legacy.ts:import" - --8<-- "src/search_legancy.ts:search1" + --8<-- "docs/src/search_legacy.ts:search1" ``` By default, `l2` will be used as metric type. You can specify the metric type as @@ -76,7 +76,7 @@ By default, `l2` will be used as metric type. You can specify the metric type as === "JavaScript" ```javascript - --8<-- "src/search_legacy.ts:search2" + --8<-- "docs/src/search_legacy.ts:search2" ``` ## Approximate nearest neighbor (ANN) search diff --git a/docs/src/sql.md b/docs/src/sql.md index c8847529..3dd0b227 100644 --- a/docs/src/sql.md +++ b/docs/src/sql.md @@ -47,7 +47,7 @@ const tbl = await db.createTable('myVectors', data) === "JavaScript" ```javascript - --8<-- "src/sql_legacy.ts:search" + --8<-- "docs/src/sql_legacy.ts:search" ``` ## SQL filters @@ -80,7 +80,7 @@ For example, the following filter string is acceptable: === "Javascript" ```javascript - --8<-- "src/sql_legacy.ts:vec_search" + --8<-- "docs/src/sql_legacy.ts:vec_search" ``` If your column name contains special characters or is a [SQL Keyword](https://docs.rs/sqlparser/latest/sqlparser/keywords/index.html), @@ -150,7 +150,7 @@ You can also filter your data without search. === "JavaScript" ```javascript - --8<---- "src/sql_legacy.ts:sql_search" + --8<---- "docs/src/sql_legacy.ts:sql_search" ``` !!!warning "If your table is large, this could potentially return a very large amount of data. Please be sure to use a `limit` clause unless you're sure you want to return the whole result set." diff --git a/rust/vectordb/examples/simple.rs b/rust/vectordb/examples/simple.rs index 9f37bb55..947c6952 100644 --- a/rust/vectordb/examples/simple.rs +++ b/rust/vectordb/examples/simple.rs @@ -24,28 +24,64 @@ use vectordb::{connect, Result, Table, TableRef}; #[tokio::main] async fn main() -> Result<()> { + if std::path::Path::new("data").exists() { + std::fs::remove_dir_all("data").unwrap(); + } // --8<-- [start:connect] let uri = "data/sample-lancedb"; let db = connect(uri).await?; // --8<-- [end:connect] - let tbl = create_table(db).await?; + + // --8<-- [start:list_names] + println!("{:?}", db.table_names().await?); + // --8<-- [end:list_names] + let tbl = create_table(db.clone()).await?; create_index(tbl.as_ref()).await?; let batches = search(tbl.as_ref()).await?; println!("{:?}", batches); + + create_empty_table(db.clone()).await.unwrap(); + + // --8<-- [start:delete] + tbl.delete("id > 24").await.unwrap(); + // --8<-- [end:delete] + + // --8<-- [start:drop_table] + db.drop_table("my_table").await.unwrap(); + // --8<-- [end:drop_table] + Ok(()) +} + +#[allow(dead_code)] +async fn open_with_existing_tbl() -> Result<()> { + let uri = "data/sample-lancedb"; + let db = connect(uri).await?; + // --8<-- [start:open_with_existing_file] + let _ = db + .open_table_with_params("my_table", Default::default()) + .await + .unwrap(); + // --8<-- [end:open_with_existing_file] Ok(()) } async fn create_table(db: Arc) -> Result { + // --8<-- [start:create_table] + const TOTAL: usize = 1000; + const DIM: usize = 128; + let schema = Arc::new(Schema::new(vec![ Field::new("id", DataType::Int32, false), Field::new( "vector", - DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 128), + DataType::FixedSizeList( + Arc::new(Field::new("item", DataType::Float32, true)), + DIM as i32, + ), true, ), ])); - const TOTAL: usize = 1000; - const DIM: usize = 128; + // Create a RecordBatch stream. let batches = RecordBatchIterator::new( vec![RecordBatch::try_new( @@ -65,19 +101,62 @@ async fn create_table(db: Arc) -> Result { .map(Ok), schema.clone(), ); - db.create_table("my_table", Box::new(batches), None).await + let tbl = db + .create_table("my_table", Box::new(batches), None) + .await + .unwrap(); + // --8<-- [end:create_table] + + let new_batches = RecordBatchIterator::new( + vec![RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from_iter_values(0..TOTAL as i32)), + Arc::new( + FixedSizeListArray::from_iter_primitive::( + (0..TOTAL).map(|_| Some(vec![Some(1.0); DIM])), + DIM as i32, + ), + ), + ], + ) + .unwrap()] + .into_iter() + .map(Ok), + schema.clone(), + ); + // --8<-- [start:add] + tbl.add(Box::new(new_batches), None).await.unwrap(); + // --8<-- [end:add] + + Ok(tbl) +} + +async fn create_empty_table(db: Arc) -> Result { + // --8<-- [start:create_empty_table] + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("item", DataType::Utf8, true), + ])); + let batches = RecordBatchIterator::new(vec![], schema.clone()); + db.create_table("empty_table", Box::new(batches), None) + .await + // --8<-- [end:create_empty_table] } async fn create_index(table: &dyn Table) -> Result<()> { + // --8<-- [start:create_index] table .create_index(&["vector"]) .ivf_pq() - .num_partitions(2) + .num_partitions(8) .build() .await + // --8<-- [end:create_index] } async fn search(table: &dyn Table) -> Result> { + // --8<-- [start:search] Ok(table .search(&[1.0; 128]) .limit(2) @@ -85,4 +164,5 @@ async fn search(table: &dyn Table) -> Result> { .await? .try_collect::>() .await?) + // --8<-- [end:search] }