From b9c532326538a032ed3197f76ba1764bf2c68751 Mon Sep 17 00:00:00 2001
From: Lei Xu <lei@lancedb.com>
Date: Sun, 28 Jan 2024 14:30:30 -0800
Subject: [PATCH] doc: use snippet for rust code example and make sure rust
 examples run through CI (#885)

---
 .github/workflows/rust.yml       |  2 +
 docs/mkdocs.yml                  |  1 +
 docs/src/ann_indexes.md          | 28 ++++------
 docs/src/basic.md                | 96 +++++++++++---------------------
 docs/src/basic.rs                |  1 -
 docs/src/search.md               |  6 +-
 docs/src/sql.md                  |  6 +-
 rust/vectordb/examples/simple.rs | 92 ++++++++++++++++++++++++++++--
 8 files changed, 141 insertions(+), 91 deletions(-)
 delete mode 120000 docs/src/basic.rs

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 5ae150b3..21db91c6 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -70,6 +70,8 @@ jobs:
       run: cargo build --all-features
     - name: Run tests
       run: cargo test --all-features
+    - name: Run examples
+      run: cargo run --example simple
   macos:
     timeout-minutes: 30
     strategy:
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
index 32b2408e..926ca9d1 100644
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -68,6 +68,7 @@ markdown_extensions:
     pygments_lang_class: true
 - pymdownx.inlinehilite
 - pymdownx.snippets:
+    base_path: ..
     dedent_subsections: true
 - pymdownx.superfences
 - pymdownx.tabbed:
diff --git a/docs/src/ann_indexes.md b/docs/src/ann_indexes.md
index bc745b75..5460e012 100644
--- a/docs/src/ann_indexes.md
+++ b/docs/src/ann_indexes.md
@@ -47,12 +47,12 @@ Lance supports `IVF_PQ` index type by default.
      tbl.create_index(num_partitions=256, num_sub_vectors=96)
      ```
 
-=== "Javascript"
+=== "Typescript"
 
-     ```javascript
-     --8<--- "src/ann_indexes.ts:import"
+     ```typescript
+     --8<--- "docs/src/ann_indexes.ts:import"
 
-     --8<-- "src/ann_indexes.ts:ingest"
+     --8<-- "docs/src/ann_indexes.ts:ingest"
      ```
 
 - **metric** (default: "L2"): The distance metric to use. By default it uses euclidean distance "`L2`".
@@ -137,10 +137,10 @@ There are a couple of parameters that can be used to fine-tune the search:
      1  [0.48587373, 0.269207, 0.15095535, 0.65531915,...  item 3953  108.393867
      ```
 
-=== "Javascript"
+=== "Typescript"
 
-     ```javascript
-     --8<-- "src/ann_indexes.ts:search1"
+     ```typescript
+     --8<-- "docs/src/ann_indexes.ts:search1"
      ```
 
 The search will return the data requested in addition to the distance of each item.
@@ -155,10 +155,10 @@ You can further filter the elements returned by a search using a where clause.
      tbl.search(np.random.random((1536))).where("item != 'item 1141'").to_pandas()
      ```
 
-=== "Javascript"
+=== "Typescript"
 
      ```javascript
-     --8<-- "src/ann_indexes.ts:search2"
+     --8<-- "docs/src/ann_indexes.ts:search2"
      ```
 
 ### Projections (select clause)
@@ -179,10 +179,10 @@ You can select the columns returned by the query using a select clause.
      ...
      ```
 
-=== "Javascript"
+=== "Typescript"
 
-     ```javascript
-     --8<-- "src/ann_indexes.ts:search3"
+     ```typescript
+     --8<-- "docs/src/ann_indexes.ts:search3"
      ```
 
 ## FAQ
@@ -212,7 +212,3 @@ On `SIFT-1M` dataset, our benchmark shows that keeping each partition 1K-4K rows
 PQ is a lossy compression of the original vector, a higher `num_sub_vectors` usually results in
 less space distortion, and thus yields better accuracy. However, a higher `num_sub_vectors` also causes heavier I/O and
 more PQ computation, and thus, higher latency. `dimension / num_sub_vectors` should be a multiple of 8 for optimum SIMD efficiency.
-
-```
-
-```
diff --git a/docs/src/basic.md b/docs/src/basic.md
index 5cca4f7e..e8032a21 100644
--- a/docs/src/basic.md
+++ b/docs/src/basic.md
@@ -16,7 +16,7 @@
       pip install lancedb
       ```
 
-=== "Javascript"
+=== "Typescript"
 
       ```shell
       npm install vectordb
@@ -24,6 +24,8 @@
 
 === "Rust"
 
+    !!! warning "The Rust SDK is experimental and might introduce breaking changes in the near future"
+
     ```shell
     cargo add vectordb
     ```
@@ -54,12 +56,12 @@
       db = lancedb.connect(uri)
       ```
 
-=== "Javascript"
+=== "Typescript"
 
     ```typescript
-    --8<-- "src/basic_legacy.ts:import"
+    --8<-- "docs/src/basic_legacy.ts:import"
 
-    --8<-- "src/basic_legacy.ts:open_db"
+    --8<-- "docs/src/basic_legacy.ts:open_db"
     ```
 
 === "Rust"
@@ -67,7 +69,7 @@
     ```rust
     #[tokio::main]
     async fn main() -> Result<()> {
-        --8<-- "src/basic.rs:connect"
+        --8<-- "rust/vectordb/examples/simple.rs:connect"
     }
     ```
 
@@ -100,10 +102,10 @@ If you need a reminder of the uri, you can call `db.uri()`.
     tbl = db.create_table("table_from_df", data=df)
     ```
 
-=== "Javascript"
+=== "Typescript"
 
-    ```javascript
-    --8<-- "src/basic_legacy.ts:create_table"
+    ```typescript
+    --8<-- "docs/src/basic_legacy.ts:create_table"
     ```
 
     If the table already exists, LanceDB will raise an error by default.
@@ -116,22 +118,7 @@ If you need a reminder of the uri, you can call `db.uri()`.
     use arrow_schema::{DataType, Schema, Field};
     use arrow_array::{RecordBatch, RecordBatchIterator};
 
-    let schema = Arc::new(Schema::new(vec![
-        Field::new("id", DataType::Int32, false),
-        Field::new("vector", DataType::FixedSizeList(
-            Arc::new(Field::new("item", DataType::Float32, true)), 128), true),
-    ]));
-    // Create a RecordBatch stream.
-    let batches = RecordBatchIterator::new(vec![
-        RecordBatch::try*new(schema.clone(),
-        vec![
-            Arc::new(Int32Array::from_iter_values(0..10)),
-            Arc::new(FixedSizeListArray::from_iter_primitive::<Float32Type, *, _>(
-                (0..10).map(|_| Some(vec![Some(1.0); 128])), 128)),
-            ]).unwrap()
-        ].into_iter().map(Ok),
-        schema.clone());
-    db.create_table("my_table", Box::new(batches), None).await.unwrap();
+    --8<-- "rust/vectordb/examples/simple.rs:create_table"
     ```
 
     If the table already exists, LanceDB will raise an error by default.
@@ -151,24 +138,16 @@ In this case, you can create an empty table and specify the schema.
       tbl = db.create_table("empty_table", schema=schema)
       ```
 
-=== "Javascript"
+=== "Typescript"
 
     ```typescript
-    --8<-- "src/basic_legacy.ts:create_empty_table"
+    --8<-- "docs/src/basic_legacy.ts:create_empty_table"
     ```
 
 === "Rust"
 
     ```rust
-    use arrow_schema::{Schema, Field, DataType};
-    use arrow_array::{RecordBatch, RecordBatchIterator};
-
-    let schema = Arc::new(Schema::new(vec![
-        Field::new("vector", DataType::FixedSizeList(
-                Arc::new(Field::new("item", DataType::Float32, true)), 2), true),
-        ]));
-    let batches = RecordBatchIterator::new(vec![].into_iter().map(Ok), schema);
-    db.create_table("empty_table", Box::new(batches), None).await.unwrap();
+    --8<-- "rust/vectordb/examples/simple.rs:create_empty_table"
     ```
 
 ## How to open an existing table
@@ -181,7 +160,7 @@ Once created, you can open a table using the following code:
     tbl = db.open_table("my_table")
     ```
 
-=== "Javascript"
+=== "Typescript"
 
     ```typescript
     const tbl = await db.openTable("myTable");
@@ -190,7 +169,7 @@ Once created, you can open a table using the following code:
 === "Rust"
 
     ```rust
-    const tbl = db.open_table_with_params("myTable", None).await.unwrap();
+    --8<-- "rust/vectordb/examples/simple.rs:open_with_existing_file"
     ```
 
 If you forget the name of your table, you can always get a listing of all table names:
@@ -210,7 +189,7 @@ If you forget the name of your table, you can always get a listing of all table
 === "Rust"
 
     ```rust
-    println!("{:?}", db.table_names().await.unwrap());
+    --8<-- "rust/vectordb/examples/simple.rs:list_names"
     ```
 
 ## How to add data to a table
@@ -231,9 +210,9 @@ After a table has been created, you can always add more data to it using
     tbl.add(data)
     ```
 
-=== "Javascript"
+=== "Typescript"
 
-    ```javascript
+    ```typescript
     await tbl.add([{vector: [1.3, 1.4], item: "fizz", price: 100.0},
                     {vector: [9.5, 56.2], item: "buzz", price: 200.0}])
     ```
@@ -241,8 +220,7 @@ After a table has been created, you can always add more data to it using
 === "Rust"
 
     ```rust
-    let batches = RecordBatchIterator::new(...);
-    tbl.add(Box::new(batches), None).await.unwrap();
+    --8<-- "rust/vectordb/examples/simple.rs:add"
     ```
 
 ## How to search for (approximate) nearest neighbors
@@ -257,24 +235,18 @@ Once you've embedded the query, you can find its nearest neighbors using the fol
 
     This returns a pandas DataFrame with the results.
 
-=== "Javascript"
+=== "Typescript"
 
-    ```javascript
-    --8<-- "src/basic_legacy.ts:search"
+    ```typescript
+    --8<-- "docs/src/basic_legacy.ts:search"
     ```
 
 === "Rust"
 
     ```rust
-    use arrow_array::RecordBatch;
     use futures::TryStreamExt;
 
-    let results: Vec<RecordBatch> = tbl
-        .search(&[100.0, 100.0])
-        .execute_stream()
-        .await
-        .unwrap()
-        .try_collect();
+    --8<-- "rust/vectordb/examples/simple.rs:search"
     ```
 
 By default, LanceDB runs a brute-force scan over dataset to find the K nearest neighbours (KNN).
@@ -286,7 +258,7 @@ For tables with more than 50K vectors, creating an ANN index is recommended to s
     tbl.create_index()
     ```
 
-=== "Javascript"
+=== "Typescript"
 
     ```{.typescript .ignore}
     await tbl.createIndex({})
@@ -295,7 +267,7 @@ For tables with more than 50K vectors, creating an ANN index is recommended to s
 === "Rust"
 
     ```rust
-    tbl.create_index(&["vector"]).build().await.unwrap()
+    --8<-- "rust/vectordb/examples/simple.rs:create_index"
     ```
 
 Check [Approximate Nearest Neighbor (ANN) Indexes](/ann_indices.md) section for more details.
@@ -312,16 +284,16 @@ This can delete any number of rows that match the filter.
     tbl.delete('item = "fizz"')
     ```
 
-=== "Javascript"
+=== "Typescript"
 
-    ```javascript
-    --8<-- "src/basic_legacy.ts:delete"
+    ```typescript
+    --8<-- "docs/src/basic_legacy.ts:delete"
     ```
 
 === "Rust"
 
     ```rust
-    tbl.delete("item = \"fizz\"").await.unwrap();
+    --8<-- "rust/vectordb/examples/simple.rs:delete"
     ```
 
 The deletion predicate is a SQL expression that supports the same expressions
@@ -350,10 +322,10 @@ Use the `drop_table()` method on the database to remove a table.
       By default, if the table does not exist an exception is raised. To suppress this,
       you can pass in `ignore_missing=True`.
 
-=== "JavaScript"
+=== "Typescript"
 
-      ```javascript
-      --8<-- "src/basic_legacy.ts:drop_table"
+      ```typescript
+      --8<-- "docs/src/basic_legacy.ts:drop_table"
       ```
 
       This permanently removes the table and is not recoverable, unlike deleting rows.
@@ -362,7 +334,7 @@ Use the `drop_table()` method on the database to remove a table.
 === "Rust"
 
     ```rust
-    db.drop_table("my_table").await.unwrap()
+    --8<-- "rust/vectordb/examples/simple.rs:drop_table"
     ```
 
 !!! note "Bundling `vectordb` apps with Webpack"
diff --git a/docs/src/basic.rs b/docs/src/basic.rs
deleted file mode 120000
index 8899e302..00000000
--- a/docs/src/basic.rs
+++ /dev/null
@@ -1 +0,0 @@
-../../rust/vectordb/examples/simple.rs
\ No newline at end of file
diff --git a/docs/src/search.md b/docs/src/search.md
index ac4613f1..911fc668 100644
--- a/docs/src/search.md
+++ b/docs/src/search.md
@@ -56,9 +56,9 @@ db.create_table("my_vectors", data=data)
 === "JavaScript"
 
     ```javascript
-    --8<-- "src/search_legacy.ts:import"
+    --8<-- "docs/src/search_legacy.ts:import"
 
-    --8<-- "src/search_legancy.ts:search1"
+    --8<-- "docs/src/search_legacy.ts:search1"
     ```
 
 By default, `l2` will be used as metric type. You can specify the metric type as
@@ -76,7 +76,7 @@ By default, `l2` will be used as metric type. You can specify the metric type as
 === "JavaScript"
 
     ```javascript
-    --8<-- "src/search_legacy.ts:search2"
+    --8<-- "docs/src/search_legacy.ts:search2"
     ```
 
 ## Approximate nearest neighbor (ANN) search
diff --git a/docs/src/sql.md b/docs/src/sql.md
index c8847529..3dd0b227 100644
--- a/docs/src/sql.md
+++ b/docs/src/sql.md
@@ -47,7 +47,7 @@ const tbl = await db.createTable('myVectors', data)
 === "JavaScript"
 
     ```javascript
-    --8<-- "src/sql_legacy.ts:search"
+    --8<-- "docs/src/sql_legacy.ts:search"
     ```
 
 ## SQL filters
@@ -80,7 +80,7 @@ For example, the following filter string is acceptable:
 === "Javascript"
 
     ```javascript
-    --8<-- "src/sql_legacy.ts:vec_search"
+    --8<-- "docs/src/sql_legacy.ts:vec_search"
     ```
 
 If your column name contains special characters or is a [SQL Keyword](https://docs.rs/sqlparser/latest/sqlparser/keywords/index.html),
@@ -150,7 +150,7 @@ You can also filter your data without search.
 === "JavaScript"
 
     ```javascript
-    --8<---- "src/sql_legacy.ts:sql_search"
+    --8<---- "docs/src/sql_legacy.ts:sql_search"
     ```
 
 !!!warning "If your table is large, this could potentially return a very large amount of data. Please be sure to use a `limit` clause unless you're sure you want to return the whole result set."
diff --git a/rust/vectordb/examples/simple.rs b/rust/vectordb/examples/simple.rs
index 9f37bb55..947c6952 100644
--- a/rust/vectordb/examples/simple.rs
+++ b/rust/vectordb/examples/simple.rs
@@ -24,28 +24,64 @@ use vectordb::{connect, Result, Table, TableRef};
 
 #[tokio::main]
 async fn main() -> Result<()> {
+    if std::path::Path::new("data").exists() {
+        std::fs::remove_dir_all("data").unwrap();
+    }
     // --8<-- [start:connect]
     let uri = "data/sample-lancedb";
     let db = connect(uri).await?;
     // --8<-- [end:connect]
-    let tbl = create_table(db).await?;
+
+    // --8<-- [start:list_names]
+    println!("{:?}", db.table_names().await?);
+    // --8<-- [end:list_names]
+    let tbl = create_table(db.clone()).await?;
     create_index(tbl.as_ref()).await?;
     let batches = search(tbl.as_ref()).await?;
     println!("{:?}", batches);
+
+    create_empty_table(db.clone()).await?;
+
+    // --8<-- [start:delete]
+    tbl.delete("id > 24").await.unwrap();
+    // --8<-- [end:delete]
+
+    // --8<-- [start:drop_table]
+    db.drop_table("my_table").await.unwrap();
+    // --8<-- [end:drop_table]
+    Ok(())
+}
+
+#[allow(dead_code)]
+async fn open_with_existing_tbl() -> Result<()> {
+    let uri = "data/sample-lancedb";
+    let db = connect(uri).await?;
+    // --8<-- [start:open_with_existing_file]
+    let _ = db
+        .open_table_with_params("my_table", Default::default())
+        .await
+        .unwrap();
+    // --8<-- [end:open_with_existing_file]
     Ok(())
 }
 
 async fn create_table(db: Arc<dyn Connection>) -> Result<TableRef> {
+    // --8<-- [start:create_table]
+    const TOTAL: usize = 1000;
+    const DIM: usize = 128;
+
     let schema = Arc::new(Schema::new(vec![
         Field::new("id", DataType::Int32, false),
         Field::new(
             "vector",
-            DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 128),
+            DataType::FixedSizeList(
+                Arc::new(Field::new("item", DataType::Float32, true)),
+                DIM as i32,
+            ),
             true,
         ),
     ]));
-    const TOTAL: usize = 1000;
-    const DIM: usize = 128;
+
     // Create a RecordBatch stream.
     let batches = RecordBatchIterator::new(
         vec![RecordBatch::try_new(
@@ -65,19 +101,62 @@ async fn create_table(db: Arc<dyn Connection>) -> Result<TableRef> {
         .map(Ok),
         schema.clone(),
     );
-    db.create_table("my_table", Box::new(batches), None).await
+    let tbl = db
+        .create_table("my_table", Box::new(batches), None)
+        .await
+        .unwrap();
+    // --8<-- [end:create_table]
+
+    let new_batches = RecordBatchIterator::new(
+        vec![RecordBatch::try_new(
+            schema.clone(),
+            vec![
+                Arc::new(Int32Array::from_iter_values(0..TOTAL as i32)),
+                Arc::new(
+                    FixedSizeListArray::from_iter_primitive::<Float32Type, _, _>(
+                        (0..TOTAL).map(|_| Some(vec![Some(1.0); DIM])),
+                        DIM as i32,
+                    ),
+                ),
+            ],
+        )
+        .unwrap()]
+        .into_iter()
+        .map(Ok),
+        schema.clone(),
+    );
+    // --8<-- [start:add]
+    tbl.add(Box::new(new_batches), None).await.unwrap();
+    // --8<-- [end:add]
+
+    Ok(tbl)
+}
+
+async fn create_empty_table(db: Arc<dyn Connection>) -> Result<TableRef> {
+    // --8<-- [start:create_empty_table]
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("id", DataType::Int32, false),
+        Field::new("item", DataType::Utf8, true),
+    ]));
+    let batches = RecordBatchIterator::new(vec![], schema.clone());
+    db.create_table("empty_table", Box::new(batches), None)
+        .await
+    // --8<-- [end:create_empty_table]
 }
 
 async fn create_index(table: &dyn Table) -> Result<()> {
+    // --8<-- [start:create_index]
     table
         .create_index(&["vector"])
         .ivf_pq()
-        .num_partitions(2)
+        .num_partitions(8)
         .build()
         .await
+    // --8<-- [end:create_index]
 }
 
 async fn search(table: &dyn Table) -> Result<Vec<RecordBatch>> {
+    // --8<-- [start:search]
     Ok(table
         .search(&[1.0; 128])
         .limit(2)
@@ -85,4 +164,5 @@ async fn search(table: &dyn Table) -> Result<Vec<RecordBatch>> {
         .await?
         .try_collect::<Vec<_>>()
         .await?)
+    // --8<-- [end:search]
 }