mirror of
https://github.com/lancedb/lancedb.git
synced 2026-01-05 19:32:56 +00:00
doc: use snippet for rust code example and make sure rust examples run through CI (#885)
This commit is contained in:
2
.github/workflows/rust.yml
vendored
2
.github/workflows/rust.yml
vendored
@@ -70,6 +70,8 @@ jobs:
|
||||
run: cargo build --all-features
|
||||
- name: Run tests
|
||||
run: cargo test --all-features
|
||||
- name: Run examples
|
||||
run: cargo run --example simple
|
||||
macos:
|
||||
timeout-minutes: 30
|
||||
strategy:
|
||||
|
||||
@@ -68,6 +68,7 @@ markdown_extensions:
|
||||
pygments_lang_class: true
|
||||
- pymdownx.inlinehilite
|
||||
- pymdownx.snippets:
|
||||
base_path: ..
|
||||
dedent_subsections: true
|
||||
- pymdownx.superfences
|
||||
- pymdownx.tabbed:
|
||||
|
||||
@@ -47,12 +47,12 @@ Lance supports `IVF_PQ` index type by default.
|
||||
tbl.create_index(num_partitions=256, num_sub_vectors=96)
|
||||
```
|
||||
|
||||
=== "Javascript"
|
||||
=== "Typescript"
|
||||
|
||||
```javascript
|
||||
--8<--- "src/ann_indexes.ts:import"
|
||||
```typescript
|
||||
--8<--- "docs/src/ann_indexes.ts:import"
|
||||
|
||||
--8<-- "src/ann_indexes.ts:ingest"
|
||||
--8<-- "docs/src/ann_indexes.ts:ingest"
|
||||
```
|
||||
|
||||
- **metric** (default: "L2"): The distance metric to use. By default, it uses Euclidean distance "`L2`".
|
||||
@@ -137,10 +137,10 @@ There are a couple of parameters that can be used to fine-tune the search:
|
||||
1 [0.48587373, 0.269207, 0.15095535, 0.65531915,... item 3953 108.393867
|
||||
```
|
||||
|
||||
=== "Javascript"
|
||||
=== "Typescript"
|
||||
|
||||
```javascript
|
||||
--8<-- "src/ann_indexes.ts:search1"
|
||||
```typescript
|
||||
--8<-- "docs/src/ann_indexes.ts:search1"
|
||||
```
|
||||
|
||||
The search will return the data requested in addition to the distance of each item.
|
||||
@@ -155,10 +155,10 @@ You can further filter the elements returned by a search using a where clause.
|
||||
tbl.search(np.random.random((1536))).where("item != 'item 1141'").to_pandas()
|
||||
```
|
||||
|
||||
=== "Javascript"
|
||||
=== "Typescript"
|
||||
|
||||
```javascript
|
||||
--8<-- "src/ann_indexes.ts:search2"
|
||||
--8<-- "docs/src/ann_indexes.ts:search2"
|
||||
```
|
||||
|
||||
### Projections (select clause)
|
||||
@@ -179,10 +179,10 @@ You can select the columns returned by the query using a select clause.
|
||||
...
|
||||
```
|
||||
|
||||
=== "Javascript"
|
||||
=== "Typescript"
|
||||
|
||||
```javascript
|
||||
--8<-- "src/ann_indexes.ts:search3"
|
||||
```typescript
|
||||
--8<-- "docs/src/ann_indexes.ts:search3"
|
||||
```
|
||||
|
||||
## FAQ
|
||||
@@ -212,7 +212,3 @@ On `SIFT-1M` dataset, our benchmark shows that keeping each partition 1K-4K rows
|
||||
PQ is a lossy compression of the original vector; a higher `num_sub_vectors` usually results in
|
||||
less space distortion, and thus yields better accuracy. However, a higher `num_sub_vectors` also causes heavier I/O and
|
||||
more PQ computation, and thus, higher latency. `dimension / num_sub_vectors` should be a multiple of 8 for optimum SIMD efficiency.
|
||||
|
||||
```
|
||||
|
||||
```
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
pip install lancedb
|
||||
```
|
||||
|
||||
=== "Javascript"
|
||||
=== "Typescript"
|
||||
|
||||
```shell
|
||||
npm install vectordb
|
||||
@@ -24,6 +24,8 @@
|
||||
|
||||
=== "Rust"
|
||||
|
||||
!!! warning "The Rust SDK is experimental and might introduce breaking changes in the near future"
|
||||
|
||||
```shell
|
||||
cargo add vectordb
|
||||
```
|
||||
@@ -54,12 +56,12 @@
|
||||
db = lancedb.connect(uri)
|
||||
```
|
||||
|
||||
=== "Javascript"
|
||||
=== "Typescript"
|
||||
|
||||
```typescript
|
||||
--8<-- "src/basic_legacy.ts:import"
|
||||
--8<-- "docs/src/basic_legacy.ts:import"
|
||||
|
||||
--8<-- "src/basic_legacy.ts:open_db"
|
||||
--8<-- "docs/src/basic_legacy.ts:open_db"
|
||||
```
|
||||
|
||||
=== "Rust"
|
||||
@@ -67,7 +69,7 @@
|
||||
```rust
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
--8<-- "src/basic.rs:connect"
|
||||
--8<-- "rust/vectordb/examples/simple.rs:connect"
|
||||
}
|
||||
```
|
||||
|
||||
@@ -100,10 +102,10 @@ If you need a reminder of the uri, you can call `db.uri()`.
|
||||
tbl = db.create_table("table_from_df", data=df)
|
||||
```
|
||||
|
||||
=== "Javascript"
|
||||
=== "Typescript"
|
||||
|
||||
```javascript
|
||||
--8<-- "src/basic_legacy.ts:create_table"
|
||||
```typescript
|
||||
--8<-- "docs/src/basic_legacy.ts:create_table"
|
||||
```
|
||||
|
||||
If the table already exists, LanceDB will raise an error by default.
|
||||
@@ -116,22 +118,7 @@ If you need a reminder of the uri, you can call `db.uri()`.
|
||||
use arrow_schema::{DataType, Schema, Field};
|
||||
use arrow_array::{RecordBatch, RecordBatchIterator};
|
||||
|
||||
let schema = Arc::new(Schema::new(vec![
|
||||
Field::new("id", DataType::Int32, false),
|
||||
Field::new("vector", DataType::FixedSizeList(
|
||||
Arc::new(Field::new("item", DataType::Float32, true)), 128), true),
|
||||
]));
|
||||
// Create a RecordBatch stream.
|
||||
let batches = RecordBatchIterator::new(vec![
|
||||
RecordBatch::try*new(schema.clone(),
|
||||
vec![
|
||||
Arc::new(Int32Array::from_iter_values(0..10)),
|
||||
Arc::new(FixedSizeListArray::from_iter_primitive::<Float32Type, *, _>(
|
||||
(0..10).map(|_| Some(vec![Some(1.0); 128])), 128)),
|
||||
]).unwrap()
|
||||
].into_iter().map(Ok),
|
||||
schema.clone());
|
||||
db.create_table("my_table", Box::new(batches), None).await.unwrap();
|
||||
--8<-- "rust/vectordb/examples/simple.rs:create_table"
|
||||
```
|
||||
|
||||
If the table already exists, LanceDB will raise an error by default.
|
||||
@@ -151,24 +138,16 @@ In this case, you can create an empty table and specify the schema.
|
||||
tbl = db.create_table("empty_table", schema=schema)
|
||||
```
|
||||
|
||||
=== "Javascript"
|
||||
=== "Typescript"
|
||||
|
||||
```typescript
|
||||
--8<-- "src/basic_legacy.ts:create_empty_table"
|
||||
--8<-- "docs/src/basic_legacy.ts:create_empty_table"
|
||||
```
|
||||
|
||||
=== "Rust"
|
||||
|
||||
```rust
|
||||
use arrow_schema::{Schema, Field, DataType};
|
||||
use arrow_array::{RecordBatch, RecordBatchIterator};
|
||||
|
||||
let schema = Arc::new(Schema::new(vec![
|
||||
Field::new("vector", DataType::FixedSizeList(
|
||||
Arc::new(Field::new("item", DataType::Float32, true)), 2), true),
|
||||
]));
|
||||
let batches = RecordBatchIterator::new(vec![].into_iter().map(Ok), schema);
|
||||
db.create_table("empty_table", Box::new(batches), None).await.unwrap();
|
||||
--8<-- "rust/vectordb/examples/simple.rs:create_empty_table"
|
||||
```
|
||||
|
||||
## How to open an existing table
|
||||
@@ -181,7 +160,7 @@ Once created, you can open a table using the following code:
|
||||
tbl = db.open_table("my_table")
|
||||
```
|
||||
|
||||
=== "Javascript"
|
||||
=== "Typescript"
|
||||
|
||||
```typescript
|
||||
const tbl = await db.openTable("myTable");
|
||||
@@ -190,7 +169,7 @@ Once created, you can open a table using the following code:
|
||||
=== "Rust"
|
||||
|
||||
```rust
|
||||
const tbl = db.open_table_with_params("myTable", None).await.unwrap();
|
||||
--8<-- "rust/vectordb/examples/simple.rs:open_with_existing_file"
|
||||
```
|
||||
|
||||
If you forget the name of your table, you can always get a listing of all table names:
|
||||
@@ -210,7 +189,7 @@ If you forget the name of your table, you can always get a listing of all table
|
||||
=== "Rust"
|
||||
|
||||
```rust
|
||||
println!("{:?}", db.table_names().await.unwrap());
|
||||
--8<-- "rust/vectordb/examples/simple.rs:list_names"
|
||||
```
|
||||
|
||||
## How to add data to a table
|
||||
@@ -231,9 +210,9 @@ After a table has been created, you can always add more data to it using
|
||||
tbl.add(data)
|
||||
```
|
||||
|
||||
=== "Javascript"
|
||||
=== "Typescript"
|
||||
|
||||
```javascript
|
||||
```typescript
|
||||
await tbl.add([{vector: [1.3, 1.4], item: "fizz", price: 100.0},
|
||||
{vector: [9.5, 56.2], item: "buzz", price: 200.0}])
|
||||
```
|
||||
@@ -241,8 +220,7 @@ After a table has been created, you can always add more data to it using
|
||||
=== "Rust"
|
||||
|
||||
```rust
|
||||
let batches = RecordBatchIterator::new(...);
|
||||
tbl.add(Box::new(batches), None).await.unwrap();
|
||||
--8<-- "rust/vectordb/examples/simple.rs:add"
|
||||
```
|
||||
|
||||
## How to search for (approximate) nearest neighbors
|
||||
@@ -257,24 +235,18 @@ Once you've embedded the query, you can find its nearest neighbors using the fol
|
||||
|
||||
This returns a pandas DataFrame with the results.
|
||||
|
||||
=== "Javascript"
|
||||
=== "Typescript"
|
||||
|
||||
```javascript
|
||||
--8<-- "src/basic_legacy.ts:search"
|
||||
```typescript
|
||||
--8<-- "docs/src/basic_legacy.ts:search"
|
||||
```
|
||||
|
||||
=== "Rust"
|
||||
|
||||
```rust
|
||||
use arrow_array::RecordBatch;
|
||||
use futures::TryStreamExt;
|
||||
|
||||
let results: Vec<RecordBatch> = tbl
|
||||
.search(&[100.0, 100.0])
|
||||
.execute_stream()
|
||||
.await
|
||||
.unwrap()
|
||||
.try_collect();
|
||||
--8<-- "rust/vectordb/examples/simple.rs:search"
|
||||
```
|
||||
|
||||
By default, LanceDB runs a brute-force scan over the dataset to find the K nearest neighbours (KNN).
|
||||
@@ -286,7 +258,7 @@ For tables with more than 50K vectors, creating an ANN index is recommended to s
|
||||
tbl.create_index()
|
||||
```
|
||||
|
||||
=== "Javascript"
|
||||
=== "Typescript"
|
||||
|
||||
```{.typescript .ignore}
|
||||
await tbl.createIndex({})
|
||||
@@ -295,7 +267,7 @@ For tables with more than 50K vectors, creating an ANN index is recommended to s
|
||||
=== "Rust"
|
||||
|
||||
```rust
|
||||
tbl.create_index(&["vector"]).build().await.unwrap()
|
||||
--8<-- "rust/vectordb/examples/simple.rs:create_index"
|
||||
```
|
||||
|
||||
Check the [Approximate Nearest Neighbor (ANN) Indexes](/ann_indices.md) section for more details.
|
||||
@@ -312,16 +284,16 @@ This can delete any number of rows that match the filter.
|
||||
tbl.delete('item = "fizz"')
|
||||
```
|
||||
|
||||
=== "Javascript"
|
||||
=== "Typescript"
|
||||
|
||||
```javascript
|
||||
--8<-- "src/basic_legacy.ts:delete"
|
||||
```typescript
|
||||
--8<-- "docs/src/basic_legacy.ts:delete"
|
||||
```
|
||||
|
||||
=== "Rust"
|
||||
|
||||
```rust
|
||||
tbl.delete("item = \"fizz\"").await.unwrap();
|
||||
--8<-- "rust/vectordb/examples/simple.rs:delete"
|
||||
```
|
||||
|
||||
The deletion predicate is a SQL expression that supports the same expressions
|
||||
@@ -350,10 +322,10 @@ Use the `drop_table()` method on the database to remove a table.
|
||||
By default, if the table does not exist an exception is raised. To suppress this,
|
||||
you can pass in `ignore_missing=True`.
|
||||
|
||||
=== "JavaScript"
|
||||
=== "Typescript"
|
||||
|
||||
```javascript
|
||||
--8<-- "src/basic_legacy.ts:drop_table"
|
||||
```typescript
|
||||
--8<-- "docs/src/basic_legacy.ts:drop_table"
|
||||
```
|
||||
|
||||
This permanently removes the table and is not recoverable, unlike deleting rows.
|
||||
@@ -362,7 +334,7 @@ Use the `drop_table()` method on the database to remove a table.
|
||||
=== "Rust"
|
||||
|
||||
```rust
|
||||
db.drop_table("my_table").await.unwrap()
|
||||
--8<-- "rust/vectordb/examples/simple.rs:drop_table"
|
||||
```
|
||||
|
||||
!!! note "Bundling `vectordb` apps with Webpack"
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
../../rust/vectordb/examples/simple.rs
|
||||
@@ -56,9 +56,9 @@ db.create_table("my_vectors", data=data)
|
||||
=== "JavaScript"
|
||||
|
||||
```javascript
|
||||
--8<-- "src/search_legacy.ts:import"
|
||||
--8<-- "docs/src/search_legacy.ts:import"
|
||||
|
||||
--8<-- "src/search_legancy.ts:search1"
|
||||
--8<-- "docs/src/search_legacy.ts:search1"
|
||||
```
|
||||
|
||||
By default, `l2` will be used as metric type. You can specify the metric type as
|
||||
@@ -76,7 +76,7 @@ By default, `l2` will be used as metric type. You can specify the metric type as
|
||||
=== "JavaScript"
|
||||
|
||||
```javascript
|
||||
--8<-- "src/search_legacy.ts:search2"
|
||||
--8<-- "docs/src/search_legacy.ts:search2"
|
||||
```
|
||||
|
||||
## Approximate nearest neighbor (ANN) search
|
||||
|
||||
@@ -47,7 +47,7 @@ const tbl = await db.createTable('myVectors', data)
|
||||
=== "JavaScript"
|
||||
|
||||
```javascript
|
||||
--8<-- "src/sql_legacy.ts:search"
|
||||
--8<-- "docs/src/sql_legacy.ts:search"
|
||||
```
|
||||
|
||||
## SQL filters
|
||||
@@ -80,7 +80,7 @@ For example, the following filter string is acceptable:
|
||||
=== "Javascript"
|
||||
|
||||
```javascript
|
||||
--8<-- "src/sql_legacy.ts:vec_search"
|
||||
--8<-- "docs/src/sql_legacy.ts:vec_search"
|
||||
```
|
||||
|
||||
If your column name contains special characters or is a [SQL Keyword](https://docs.rs/sqlparser/latest/sqlparser/keywords/index.html),
|
||||
@@ -150,7 +150,7 @@ You can also filter your data without search.
|
||||
=== "JavaScript"
|
||||
|
||||
```javascript
|
||||
--8<---- "src/sql_legacy.ts:sql_search"
|
||||
--8<---- "docs/src/sql_legacy.ts:sql_search"
|
||||
```
|
||||
|
||||
!!!warning "If your table is large, this could potentially return a very large amount of data. Please be sure to use a `limit` clause unless you're sure you want to return the whole result set."
|
||||
|
||||
@@ -24,28 +24,64 @@ use vectordb::{connect, Result, Table, TableRef};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
if std::path::Path::new("data").exists() {
|
||||
std::fs::remove_dir_all("data").unwrap();
|
||||
}
|
||||
// --8<-- [start:connect]
|
||||
let uri = "data/sample-lancedb";
|
||||
let db = connect(uri).await?;
|
||||
// --8<-- [end:connect]
|
||||
let tbl = create_table(db).await?;
|
||||
|
||||
// --8<-- [start:list_names]
|
||||
println!("{:?}", db.table_names().await?);
|
||||
// --8<-- [end:list_names]
|
||||
let tbl = create_table(db.clone()).await?;
|
||||
create_index(tbl.as_ref()).await?;
|
||||
let batches = search(tbl.as_ref()).await?;
|
||||
println!("{:?}", batches);
|
||||
|
||||
create_empty_table(db.clone()).await.unwrap();
|
||||
|
||||
// --8<-- [start:delete]
|
||||
tbl.delete("id > 24").await.unwrap();
|
||||
// --8<-- [end:delete]
|
||||
|
||||
// --8<-- [start:drop_table]
|
||||
db.drop_table("my_table").await.unwrap();
|
||||
// --8<-- [end:drop_table]
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
async fn open_with_existing_tbl() -> Result<()> {
|
||||
let uri = "data/sample-lancedb";
|
||||
let db = connect(uri).await?;
|
||||
// --8<-- [start:open_with_existing_file]
|
||||
let _ = db
|
||||
.open_table_with_params("my_table", Default::default())
|
||||
.await
|
||||
.unwrap();
|
||||
// --8<-- [end:open_with_existing_file]
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn create_table(db: Arc<dyn Connection>) -> Result<TableRef> {
|
||||
// --8<-- [start:create_table]
|
||||
const TOTAL: usize = 1000;
|
||||
const DIM: usize = 128;
|
||||
|
||||
let schema = Arc::new(Schema::new(vec![
|
||||
Field::new("id", DataType::Int32, false),
|
||||
Field::new(
|
||||
"vector",
|
||||
DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 128),
|
||||
DataType::FixedSizeList(
|
||||
Arc::new(Field::new("item", DataType::Float32, true)),
|
||||
DIM as i32,
|
||||
),
|
||||
true,
|
||||
),
|
||||
]));
|
||||
const TOTAL: usize = 1000;
|
||||
const DIM: usize = 128;
|
||||
|
||||
// Create a RecordBatch stream.
|
||||
let batches = RecordBatchIterator::new(
|
||||
vec![RecordBatch::try_new(
|
||||
@@ -65,19 +101,62 @@ async fn create_table(db: Arc<dyn Connection>) -> Result<TableRef> {
|
||||
.map(Ok),
|
||||
schema.clone(),
|
||||
);
|
||||
db.create_table("my_table", Box::new(batches), None).await
|
||||
let tbl = db
|
||||
.create_table("my_table", Box::new(batches), None)
|
||||
.await
|
||||
.unwrap();
|
||||
// --8<-- [end:create_table]
|
||||
|
||||
let new_batches = RecordBatchIterator::new(
|
||||
vec![RecordBatch::try_new(
|
||||
schema.clone(),
|
||||
vec![
|
||||
Arc::new(Int32Array::from_iter_values(0..TOTAL as i32)),
|
||||
Arc::new(
|
||||
FixedSizeListArray::from_iter_primitive::<Float32Type, _, _>(
|
||||
(0..TOTAL).map(|_| Some(vec![Some(1.0); DIM])),
|
||||
DIM as i32,
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
.unwrap()]
|
||||
.into_iter()
|
||||
.map(Ok),
|
||||
schema.clone(),
|
||||
);
|
||||
// --8<-- [start:add]
|
||||
tbl.add(Box::new(new_batches), None).await.unwrap();
|
||||
// --8<-- [end:add]
|
||||
|
||||
Ok(tbl)
|
||||
}
|
||||
|
||||
async fn create_empty_table(db: Arc<dyn Connection>) -> Result<TableRef> {
|
||||
// --8<-- [start:create_empty_table]
|
||||
let schema = Arc::new(Schema::new(vec![
|
||||
Field::new("id", DataType::Int32, false),
|
||||
Field::new("item", DataType::Utf8, true),
|
||||
]));
|
||||
let batches = RecordBatchIterator::new(vec![], schema.clone());
|
||||
db.create_table("empty_table", Box::new(batches), None)
|
||||
.await
|
||||
// --8<-- [end:create_empty_table]
|
||||
}
|
||||
|
||||
async fn create_index(table: &dyn Table) -> Result<()> {
|
||||
// --8<-- [start:create_index]
|
||||
table
|
||||
.create_index(&["vector"])
|
||||
.ivf_pq()
|
||||
.num_partitions(2)
|
||||
.num_partitions(8)
|
||||
.build()
|
||||
.await
|
||||
// --8<-- [end:create_index]
|
||||
}
|
||||
|
||||
async fn search(table: &dyn Table) -> Result<Vec<RecordBatch>> {
|
||||
// --8<-- [start:search]
|
||||
Ok(table
|
||||
.search(&[1.0; 128])
|
||||
.limit(2)
|
||||
@@ -85,4 +164,5 @@ async fn search(table: &dyn Table) -> Result<Vec<RecordBatch>> {
|
||||
.await?
|
||||
.try_collect::<Vec<_>>()
|
||||
.await?)
|
||||
// --8<-- [end:search]
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user