mirror of
https://github.com/lancedb/lancedb.git
synced 2026-05-25 16:00:49 +00:00
feat: allow setting train=False and name on indices (#2586)
Enables two new parameters when building indices:
* `name`: Allows explicitly setting a name on the index. Default is
`{col_name}_idx`.
* `train` (default `True`): When set to `False`, an empty index will be
immediately created.
The upgrade of Lance means there are also additional behaviors from
cd76a993b8:
* When a scalar index is created on a Table, it will be kept around even
if all rows are deleted or updated.
* Scalar indices can be created on empty tables. They will default to
`train=False` if the table is empty.
---------
Co-authored-by: Weston Pace <weston.pace@gmail.com>
This commit is contained in:
66
CLAUDE.md
66
CLAUDE.md
@@ -13,10 +13,68 @@ Project layout:
|
||||
|
||||
Common commands:
|
||||
|
||||
* Check for compiler errors: `cargo check --features remote --tests --examples`
|
||||
* Run tests: `cargo test --features remote --tests`
|
||||
* Run specific test: `cargo test --features remote -p <package_name> --test <test_name>`
|
||||
* Lint: `cargo clippy --features remote --tests --examples`
|
||||
* Check for compiler errors: `cargo check --quiet --features remote --tests --examples`
|
||||
* Run tests: `cargo test --quiet --features remote --tests`
|
||||
* Run specific test: `cargo test --quiet --features remote -p <package_name> --test <test_name>`
|
||||
* Lint: `cargo clippy --quiet --features remote --tests --examples`
|
||||
* Format: `cargo fmt --all`
|
||||
|
||||
Before committing changes, run formatting.
|
||||
|
||||
## Coding tips
|
||||
|
||||
* When writing Rust doctests for things that require a connection or table reference,
|
||||
write them as a function instead of a fully executable test. This allows type checking
|
||||
to run but avoids needing a full test environment. For example:
|
||||
```rust
|
||||
/// ```
|
||||
/// use lance_index::scalar::FullTextSearchQuery;
|
||||
/// use lancedb::query::{QueryBase, ExecutableQuery};
|
||||
///
|
||||
/// # use lancedb::Table;
|
||||
/// # async fn query(table: &Table) -> Result<(), Box<dyn std::error::Error>> {
|
||||
/// let results = table.query()
|
||||
/// .full_text_search(FullTextSearchQuery::new("hello world".into()))
|
||||
/// .execute()
|
||||
/// .await?;
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
/// ```
|
||||
```
|
||||
|
||||
## Example plan: adding a new method on Table
|
||||
|
||||
Adding a new method involves first adding it to the Rust core, then exposing it
|
||||
in the Python and TypeScript bindings. There are both local and remote tables.
|
||||
Remote tables are implemented via an HTTP API and require the `remote` cargo
|
||||
feature flag to be enabled. Python has both sync and async methods.
|
||||
|
||||
Rust core changes:
|
||||
|
||||
1. Add method on `Table` struct in `rust/lancedb/src/table.rs` (calls `BaseTable` trait).
|
||||
2. Add method to `BaseTable` trait in `rust/lancedb/src/table.rs`.
|
||||
3. Implement new trait method on `NativeTable` in `rust/lancedb/src/table.rs`.
|
||||
* Test with unit test in `rust/lancedb/src/table.rs`.
|
||||
4. Implement new trait method on `RemoteTable` in `rust/lancedb/src/remote/table.rs`.
|
||||
* Test with unit test in `rust/lancedb/src/remote/table.rs` against mocked endpoint.
|
||||
|
||||
Python bindings changes:
|
||||
|
||||
1. Add PyO3 method binding in `python/src/table.rs`. Run `make develop` to compile bindings.
|
||||
2. Add types for PyO3 method in `python/python/lancedb/_lancedb.pyi`.
|
||||
3. Add method to `AsyncTable` class in `python/python/lancedb/table.py`.
|
||||
4. Add abstract method to `Table` abstract base class in `python/python/lancedb/table.py`.
|
||||
5. Add concrete sync method to `LanceTable` class in `python/python/lancedb/table.py`.
|
||||
* Should use `LOOP.run()` to call the corresponding `AsyncTable` method.
|
||||
6. Add concrete sync method to `RemoteTable` class in `python/python/lancedb/remote/table.py`.
|
||||
7. Add unit test in `python/tests/test_table.py`.
|
||||
|
||||
TypeScript bindings changes:
|
||||
|
||||
1. Add napi-rs method binding on `Table` in `nodejs/src/table.rs`.
|
||||
2. Run `npm run build` to generate TypeScript definitions.
|
||||
3. Add TypeScript method on abstract class `Table` in `nodejs/src/table.ts`.
|
||||
4. Add concrete method on `LocalTable` class in `nodejs/src/native_table.ts`.
|
||||
* Note: despite the name, this class is also used for remote tables.
|
||||
5. Add test in `nodejs/__test__/table.test.ts`.
|
||||
6. Run `npm run docs` to generate TypeScript documentation.
|
||||
|
||||
@@ -26,6 +26,18 @@ will be used to determine the most useful kind of index to create.
|
||||
|
||||
***
|
||||
|
||||
### name?
|
||||
|
||||
```ts
|
||||
optional name: string;
|
||||
```
|
||||
|
||||
Optional custom name for the index.
|
||||
|
||||
If not provided, a default name will be generated based on the column name.
|
||||
|
||||
***
|
||||
|
||||
### replace?
|
||||
|
||||
```ts
|
||||
@@ -42,8 +54,27 @@ The default is true
|
||||
|
||||
***
|
||||
|
||||
### train?
|
||||
|
||||
```ts
|
||||
optional train: boolean;
|
||||
```
|
||||
|
||||
Whether to train the index with existing data.
|
||||
|
||||
If true (default), the index will be trained with existing data in the table.
|
||||
If false, the index will be created empty and populated as new data is added.
|
||||
|
||||
Note: This option is only supported for scalar indices. Vector indices always train.
|
||||
|
||||
***
|
||||
|
||||
### waitTimeoutSeconds?
|
||||
|
||||
```ts
|
||||
optional waitTimeoutSeconds: number;
|
||||
```
|
||||
|
||||
Timeout in seconds to wait for index creation to complete.
|
||||
|
||||
If not specified, the method will return immediately after starting the index creation.
|
||||
|
||||
@@ -857,6 +857,40 @@ describe("When creating an index", () => {
|
||||
expect(stats).toBeUndefined();
|
||||
});
|
||||
|
||||
test("should support name and train parameters", async () => {
  // Small helper: the names of every index currently registered on the table.
  const indexNames = async (): Promise<string[]> =>
    (await tbl.listIndices()).map((idx) => idx.name);

  // 1. A vector index created with an explicit name is listed under that name.
  await tbl.createIndex("vec", {
    config: Index.ivfPq({ numPartitions: 4 }),
    name: "my_custom_vector_index",
  });
  const afterVector = await indexNames();
  expect(afterVector).toHaveLength(1);
  expect(afterVector[0]).toBe("my_custom_vector_index");

  // 2. A scalar index may be created untrained (train=false) under a custom name.
  await tbl.createIndex("id", {
    config: Index.btree(),
    name: "btree_empty",
    train: false,
  });
  const afterBtree = await indexNames();
  expect(afterBtree).toHaveLength(2);
  expect(afterBtree.includes("btree_empty")).toBe(true);

  // 3. Name and train=true can be combined (exercised on the tags column).
  await tbl.createIndex("tags", {
    config: Index.labelList(),
    name: "tags_trained",
    train: true,
  });
  const afterTags = await indexNames();
  expect(afterTags).toHaveLength(3);
  expect(afterTags.includes("tags_trained")).toBe(true);
});
|
||||
|
||||
test("create ivf_flat with binary vectors", async () => {
|
||||
const db = await connect(tmpDir.name);
|
||||
const binarySchema = new Schema([
|
||||
|
||||
@@ -700,5 +700,27 @@ export interface IndexOptions {
|
||||
*/
|
||||
replace?: boolean;
|
||||
|
||||
/**
|
||||
* Timeout in seconds to wait for index creation to complete.
|
||||
*
|
||||
* If not specified, the method will return immediately after starting the index creation.
|
||||
*/
|
||||
waitTimeoutSeconds?: number;
|
||||
|
||||
/**
|
||||
* Optional custom name for the index.
|
||||
*
|
||||
* If not provided, a default name will be generated based on the column name.
|
||||
*/
|
||||
name?: string;
|
||||
|
||||
/**
|
||||
* Whether to train the index with existing data.
|
||||
*
|
||||
* If true (default), the index will be trained with existing data in the table.
|
||||
* If false, the index will be created empty and populated as new data is added.
|
||||
*
|
||||
* Note: This option is only supported for scalar indices. Vector indices always train.
|
||||
*/
|
||||
train?: boolean;
|
||||
}
|
||||
|
||||
@@ -662,6 +662,8 @@ export class LocalTable extends Table {
|
||||
column,
|
||||
options?.replace,
|
||||
options?.waitTimeoutSeconds,
|
||||
options?.name,
|
||||
options?.train,
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -114,6 +114,8 @@ impl Table {
|
||||
column: String,
|
||||
replace: Option<bool>,
|
||||
wait_timeout_s: Option<i64>,
|
||||
name: Option<String>,
|
||||
train: Option<bool>,
|
||||
) -> napi::Result<()> {
|
||||
let lancedb_index = if let Some(index) = index {
|
||||
index.consume()?
|
||||
@@ -128,6 +130,12 @@ impl Table {
|
||||
builder =
|
||||
builder.wait_timeout(std::time::Duration::from_secs(timeout.try_into().unwrap()));
|
||||
}
|
||||
if let Some(name) = name {
|
||||
builder = builder.name(name);
|
||||
}
|
||||
if let Some(train) = train {
|
||||
builder = builder.train(train);
|
||||
}
|
||||
builder.execute().await.default_error()
|
||||
}
|
||||
|
||||
|
||||
@@ -59,6 +59,10 @@ class Table:
|
||||
column: str,
|
||||
index: Union[IvfFlat, IvfPq, HnswPq, HnswSq, BTree, Bitmap, LabelList, FTS],
|
||||
replace: Optional[bool],
|
||||
wait_timeout: Optional[object],
|
||||
*,
|
||||
name: Optional[str],
|
||||
train: Optional[bool],
|
||||
): ...
|
||||
async def list_versions(self) -> List[Dict[str, Any]]: ...
|
||||
async def version(self) -> int: ...
|
||||
|
||||
@@ -194,6 +194,8 @@ class RemoteTable(Table):
|
||||
wait_timeout: Optional[timedelta] = None,
|
||||
*,
|
||||
num_bits: int = 8,
|
||||
name: Optional[str] = None,
|
||||
train: bool = True,
|
||||
):
|
||||
"""Create an index on the table.
|
||||
Currently, the only parameters that matter are
|
||||
@@ -270,7 +272,11 @@ class RemoteTable(Table):
|
||||
|
||||
LOOP.run(
|
||||
self._table.create_index(
|
||||
vector_column_name, config=config, wait_timeout=wait_timeout
|
||||
vector_column_name,
|
||||
config=config,
|
||||
wait_timeout=wait_timeout,
|
||||
name=name,
|
||||
train=train,
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
@@ -689,6 +689,8 @@ class Table(ABC):
|
||||
sample_rate: int = 256,
|
||||
m: int = 20,
|
||||
ef_construction: int = 300,
|
||||
name: Optional[str] = None,
|
||||
train: bool = True,
|
||||
):
|
||||
"""Create an index on the table.
|
||||
|
||||
@@ -721,6 +723,11 @@ class Table(ABC):
|
||||
Only 4 and 8 are supported.
|
||||
wait_timeout: timedelta, optional
|
||||
The timeout to wait if indexing is asynchronous.
|
||||
name: str, optional
|
||||
The name of the index. If not provided, a default name will be generated.
|
||||
train: bool, default True
|
||||
Whether to train the index with existing data. Vector indices always train
|
||||
with existing data.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@@ -1929,6 +1936,9 @@ class LanceTable(Table):
|
||||
sample_rate: int = 256,
|
||||
m: int = 20,
|
||||
ef_construction: int = 300,
|
||||
*,
|
||||
name: Optional[str] = None,
|
||||
train: bool = True,
|
||||
):
|
||||
"""Create an index on the table."""
|
||||
if accelerator is not None:
|
||||
@@ -1992,6 +2002,8 @@ class LanceTable(Table):
|
||||
vector_column_name,
|
||||
replace=replace,
|
||||
config=config,
|
||||
name=name,
|
||||
train=train,
|
||||
)
|
||||
)
|
||||
|
||||
@@ -3251,6 +3263,8 @@ class AsyncTable:
|
||||
Union[IvfFlat, IvfPq, HnswPq, HnswSq, BTree, Bitmap, LabelList, FTS]
|
||||
] = None,
|
||||
wait_timeout: Optional[timedelta] = None,
|
||||
name: Optional[str] = None,
|
||||
train: bool = True,
|
||||
):
|
||||
"""Create an index to speed up queries
|
||||
|
||||
@@ -3277,6 +3291,11 @@ class AsyncTable:
|
||||
creating an index object.
|
||||
wait_timeout: timedelta, optional
|
||||
The timeout to wait if indexing is asynchronous.
|
||||
name: str, optional
|
||||
The name of the index. If not provided, a default name will be generated.
|
||||
train: bool, default True
|
||||
Whether to train the index with existing data. Vector indices always train
|
||||
with existing data.
|
||||
"""
|
||||
if config is not None:
|
||||
if not isinstance(
|
||||
@@ -3288,7 +3307,12 @@ class AsyncTable:
|
||||
)
|
||||
try:
|
||||
await self._inner.create_index(
|
||||
column, index=config, replace=replace, wait_timeout=wait_timeout
|
||||
column,
|
||||
index=config,
|
||||
replace=replace,
|
||||
wait_timeout=wait_timeout,
|
||||
name=name,
|
||||
train=train,
|
||||
)
|
||||
except ValueError as e:
|
||||
if "not support the requested language" in str(e):
|
||||
|
||||
@@ -670,7 +670,9 @@ def test_create_index_method(mock_create_index, mem_db: DBConnection):
|
||||
num_sub_vectors=96,
|
||||
num_bits=4,
|
||||
)
|
||||
mock_create_index.assert_called_with("vector", replace=True, config=expected_config)
|
||||
mock_create_index.assert_called_with(
|
||||
"vector", replace=True, config=expected_config, name=None, train=True
|
||||
)
|
||||
|
||||
table.create_index(
|
||||
vector_column_name="my_vector",
|
||||
@@ -680,7 +682,7 @@ def test_create_index_method(mock_create_index, mem_db: DBConnection):
|
||||
)
|
||||
expected_config = HnswPq(distance_type="dot")
|
||||
mock_create_index.assert_called_with(
|
||||
"my_vector", replace=False, config=expected_config
|
||||
"my_vector", replace=False, config=expected_config, name=None, train=True
|
||||
)
|
||||
|
||||
table.create_index(
|
||||
@@ -695,7 +697,44 @@ def test_create_index_method(mock_create_index, mem_db: DBConnection):
|
||||
distance_type="cosine", sample_rate=0.1, m=29, ef_construction=10
|
||||
)
|
||||
mock_create_index.assert_called_with(
|
||||
"my_vector", replace=True, config=expected_config
|
||||
"my_vector", replace=True, config=expected_config, name=None, train=True
|
||||
)
|
||||
|
||||
|
||||
@patch("lancedb.table.AsyncTable.create_index")
def test_create_index_name_and_train_parameters(
    mock_create_index, mem_db: DBConnection
):
    """Check that ``name`` and ``train`` are forwarded to ``AsyncTable.create_index``.

    ``AsyncTable.create_index`` is mocked, so no index is actually built; only the
    arguments of the most recent call are inspected.
    """
    rows = [
        {"vector": [3.1, 4.1], "id": 1},
        {"vector": [5.9, 26.5], "id": 2},
    ]
    table = mem_db.create_table("test", data=rows)

    # No index type is requested, so the default config is expected every time.
    expected_config = IvfPq()

    # (extra kwargs given to create_index, name expected downstream, train expected)
    scenarios = [
        ({"name": "my_custom_index"}, "my_custom_index", True),  # custom name only
        ({"train": False}, None, False),  # train=False only
        ({"name": "my_index_name", "train": True}, "my_index_name", True),  # both
    ]

    for extra_kwargs, expected_name, expected_train in scenarios:
        table.create_index(vector_column_name="vector", **extra_kwargs)
        mock_create_index.assert_called_with(
            "vector",
            replace=True,
            config=expected_config,
            name=expected_name,
            train=expected_train,
        )
|
||||
|
||||
|
||||
|
||||
@@ -341,13 +341,15 @@ impl Table {
|
||||
})
|
||||
}
|
||||
|
||||
#[pyo3(signature = (column, index=None, replace=None, wait_timeout=None))]
|
||||
#[pyo3(signature = (column, index=None, replace=None, wait_timeout=None, *, name=None, train=None))]
|
||||
pub fn create_index<'a>(
|
||||
self_: PyRef<'a, Self>,
|
||||
column: String,
|
||||
index: Option<Bound<'_, PyAny>>,
|
||||
replace: Option<bool>,
|
||||
wait_timeout: Option<Bound<'_, PyAny>>,
|
||||
name: Option<String>,
|
||||
train: Option<bool>,
|
||||
) -> PyResult<Bound<'a, PyAny>> {
|
||||
let index = extract_index_params(&index)?;
|
||||
let timeout = wait_timeout.map(|t| t.extract::<std::time::Duration>().unwrap());
|
||||
@@ -357,6 +359,12 @@ impl Table {
|
||||
if let Some(replace) = replace {
|
||||
op = op.replace(replace);
|
||||
}
|
||||
if let Some(name) = name {
|
||||
op = op.name(name);
|
||||
}
|
||||
if let Some(train) = train {
|
||||
op = op.train(train);
|
||||
}
|
||||
|
||||
future_into_py(self_.py(), async move {
|
||||
op.execute().await.infer_error()?;
|
||||
|
||||
@@ -65,12 +65,94 @@ pub enum Index {
|
||||
/// Builder for the create_index operation
|
||||
///
|
||||
/// The methods on this builder are used to specify options common to all indices.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// Creating a basic vector index:
|
||||
///
|
||||
/// ```
|
||||
/// use lancedb::{connect, index::{Index, vector::IvfPqIndexBuilder}};
|
||||
///
|
||||
/// # async fn create_basic_vector_index() -> lancedb::Result<()> {
|
||||
/// let db = connect("data/sample-lancedb").execute().await?;
|
||||
/// let table = db.open_table("my_table").execute().await?;
|
||||
///
|
||||
/// // Create a vector index with default settings
|
||||
/// table
|
||||
/// .create_index(&["vector"], Index::IvfPq(IvfPqIndexBuilder::default()))
|
||||
/// .execute()
|
||||
/// .await?;
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
/// ```
|
||||
///
|
||||
/// Creating an index with a custom name:
|
||||
///
|
||||
/// ```
|
||||
/// use lancedb::{connect, index::{Index, vector::IvfPqIndexBuilder}};
|
||||
///
|
||||
/// # async fn create_named_index() -> lancedb::Result<()> {
|
||||
/// let db = connect("data/sample-lancedb").execute().await?;
|
||||
/// let table = db.open_table("my_table").execute().await?;
|
||||
///
|
||||
/// // Create a vector index with a custom name
|
||||
/// table
|
||||
/// .create_index(&["embeddings"], Index::IvfPq(IvfPqIndexBuilder::default()))
|
||||
/// .name("my_embeddings_index".to_string())
|
||||
/// .execute()
|
||||
/// .await?;
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
/// ```
|
||||
///
|
||||
/// Creating an untrained index (for scalar indices only):
|
||||
///
|
||||
/// ```
|
||||
/// use lancedb::{connect, index::{Index, scalar::BTreeIndexBuilder}};
|
||||
///
|
||||
/// # async fn create_untrained_index() -> lancedb::Result<()> {
|
||||
/// let db = connect("data/sample-lancedb").execute().await?;
|
||||
/// let table = db.open_table("my_table").execute().await?;
|
||||
///
|
||||
/// // Create a BTree index without training (creates empty index)
|
||||
/// table
|
||||
/// .create_index(&["category"], Index::BTree(BTreeIndexBuilder::default()))
|
||||
/// .train(false)
|
||||
/// .name("category_index".to_string())
|
||||
/// .execute()
|
||||
/// .await?;
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
/// ```
|
||||
///
|
||||
/// Creating a scalar index with all options:
|
||||
///
|
||||
/// ```
|
||||
/// use lancedb::{connect, index::{Index, scalar::BitmapIndexBuilder}};
|
||||
///
|
||||
/// # async fn create_full_options_index() -> lancedb::Result<()> {
|
||||
/// let db = connect("data/sample-lancedb").execute().await?;
|
||||
/// let table = db.open_table("my_table").execute().await?;
|
||||
///
|
||||
/// // Create a bitmap index with full configuration
|
||||
/// table
|
||||
/// .create_index(&["status"], Index::Bitmap(BitmapIndexBuilder::default()))
|
||||
/// .name("status_bitmap_index".to_string())
|
||||
/// .train(true) // Train the index with existing data
|
||||
/// .replace(false) // Don't replace if index already exists
|
||||
/// .execute()
|
||||
/// .await?;
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
/// ```
|
||||
pub struct IndexBuilder {
|
||||
// Table this builder was obtained from.
// NOTE(review): presumably the table `execute()` creates the index on — confirm.
parent: Arc<dyn BaseTable>,
|
||||
// The kind of index to build, together with its type-specific options.
pub(crate) index: Index,
|
||||
// Names of the columns the index is created on.
pub(crate) columns: Vec<String>,
|
||||
// Set through `replace()`; the constructor initialises it to `true`.
pub(crate) replace: bool,
|
||||
// How long to wait for asynchronous indexing to complete; `None` (the initial
// value) means `create_index()` will not wait.
pub(crate) wait_timeout: Option<Duration>,
|
||||
// Whether to train the index; initialised to `true`. When `false` the index is
// not trained and is just created empty.
pub(crate) train: bool,
|
||||
// Explicit index name; `None` (the initial value) means a default name will be
// generated.
pub(crate) name: Option<String>,
|
||||
}
|
||||
|
||||
impl IndexBuilder {
|
||||
@@ -80,7 +162,9 @@ impl IndexBuilder {
|
||||
index,
|
||||
columns,
|
||||
replace: true,
|
||||
train: true,
|
||||
wait_timeout: None,
|
||||
name: None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -94,6 +178,82 @@ impl IndexBuilder {
|
||||
self
|
||||
}
|
||||
|
||||
/// The name of the index. If not set, a default name will be generated.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```
|
||||
/// use lancedb::{connect, index::{Index, scalar::BTreeIndexBuilder}};
|
||||
///
|
||||
/// # async fn name_example() -> lancedb::Result<()> {
|
||||
/// let db = connect("data/sample-lancedb").execute().await?;
|
||||
/// let table = db.open_table("my_table").execute().await?;
|
||||
///
|
||||
/// // Create an index with a custom name
|
||||
/// table
|
||||
/// .create_index(&["user_id"], Index::BTree(BTreeIndexBuilder::default()))
|
||||
/// .name("user_id_btree_index".to_string())
|
||||
/// .execute()
|
||||
/// .await?;
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
/// ```
|
||||
pub fn name(mut self, v: String) -> Self {
|
||||
self.name = Some(v);
|
||||
self
|
||||
}
|
||||
|
||||
/// Whether to train the index, the default is `true`.
|
||||
///
|
||||
/// If this is false, the index will not be trained and just created empty.
|
||||
///
|
||||
/// This is not supported for vector indices yet.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// Creating an empty index that will be populated later:
|
||||
///
|
||||
/// ```
|
||||
/// use lancedb::{connect, index::{Index, scalar::BitmapIndexBuilder}};
|
||||
///
|
||||
/// # async fn train_false_example() -> lancedb::Result<()> {
|
||||
/// let db = connect("data/sample-lancedb").execute().await?;
|
||||
/// let table = db.open_table("my_table").execute().await?;
|
||||
///
|
||||
/// // Create an empty bitmap index (not trained with existing data)
|
||||
/// table
|
||||
/// .create_index(&["category"], Index::Bitmap(BitmapIndexBuilder::default()))
|
||||
/// .train(false) // Create empty index
|
||||
/// .name("category_bitmap".to_string())
|
||||
/// .execute()
|
||||
/// .await?;
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
/// ```
|
||||
///
|
||||
/// Creating a trained index (default behavior):
|
||||
///
|
||||
/// ```
|
||||
/// use lancedb::{connect, index::{Index, scalar::BTreeIndexBuilder}};
|
||||
///
|
||||
/// # async fn train_true_example() -> lancedb::Result<()> {
|
||||
/// let db = connect("data/sample-lancedb").execute().await?;
|
||||
/// let table = db.open_table("my_table").execute().await?;
|
||||
///
|
||||
/// // Create a trained BTree index (includes existing data)
|
||||
/// table
|
||||
/// .create_index(&["timestamp"], Index::BTree(BTreeIndexBuilder::default()))
|
||||
/// .train(true) // Train with existing data (this is the default)
|
||||
/// .execute()
|
||||
/// .await?;
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
/// ```
|
||||
pub fn train(mut self, v: bool) -> Self {
|
||||
self.train = v;
|
||||
self
|
||||
}
|
||||
|
||||
/// Duration of time to wait for asynchronous indexing to complete. If not set,
|
||||
/// `create_index()` will not wait.
|
||||
///
|
||||
|
||||
@@ -999,6 +999,18 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
|
||||
"column": column
|
||||
});
|
||||
|
||||
// Add name parameter if provided (for backwards compatibility, only include if Some)
|
||||
if let Some(ref name) = index.name {
|
||||
body["name"] = serde_json::Value::String(name.clone());
|
||||
}
|
||||
|
||||
// Warn if train=false is specified since it's not meaningful
|
||||
if !index.train {
|
||||
log::warn!(
|
||||
"train=false has no effect remote tables. The index will be created empty and automatically populated in the background."
|
||||
);
|
||||
}
|
||||
|
||||
match index.index {
|
||||
// TODO: Should we pass the actual index parameters? SaaS does not
|
||||
// yet support them.
|
||||
@@ -1084,8 +1096,8 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
|
||||
self.check_table_response(&request_id, response).await?;
|
||||
|
||||
if let Some(wait_timeout) = index.wait_timeout {
|
||||
let name = format!("{}_idx", column);
|
||||
self.wait_for_index(&[&name], wait_timeout).await?;
|
||||
let index_name = index.name.unwrap_or_else(|| format!("{}_idx", column));
|
||||
self.wait_for_index(&[&index_name], wait_timeout).await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -28,9 +28,11 @@ use lance::dataset::{
|
||||
};
|
||||
use lance::dataset::{MergeInsertBuilder as LanceMergeInsertBuilder, WhenNotMatchedBySource};
|
||||
use lance::index::vector::utils::infer_vector_dim;
|
||||
use lance::index::vector::VectorIndexParams;
|
||||
use lance::io::WrappingObjectStore;
|
||||
use lance_datafusion::exec::{analyze_plan as lance_analyze_plan, execute_plan};
|
||||
use lance_datafusion::utils::StreamingWriteSource;
|
||||
use lance_index::scalar::{ScalarIndexParams, ScalarIndexType};
|
||||
use lance_index::vector::hnsw::builder::HnswBuildParams;
|
||||
use lance_index::vector::ivf::IvfBuildParams;
|
||||
use lance_index::vector::pq::PQBuildParams;
|
||||
@@ -50,11 +52,7 @@ use crate::arrow::IntoArrow;
|
||||
use crate::connection::NoData;
|
||||
use crate::embeddings::{EmbeddingDefinition, EmbeddingRegistry, MaybeEmbedded, MemoryRegistry};
|
||||
use crate::error::{Error, Result};
|
||||
use crate::index::scalar::FtsIndexBuilder;
|
||||
use crate::index::vector::{
|
||||
suggested_num_partitions_for_hnsw, IvfFlatIndexBuilder, IvfHnswPqIndexBuilder,
|
||||
IvfHnswSqIndexBuilder, IvfPqIndexBuilder, VectorIndex,
|
||||
};
|
||||
use crate::index::vector::{suggested_num_partitions_for_hnsw, VectorIndex};
|
||||
use crate::index::IndexStatistics;
|
||||
use crate::index::{
|
||||
vector::{suggested_num_partitions, suggested_num_sub_vectors},
|
||||
@@ -1698,345 +1696,211 @@ impl NativeTable {
|
||||
.collect())
|
||||
}
|
||||
|
||||
async fn create_ivf_flat_index(
|
||||
&self,
|
||||
index: IvfFlatIndexBuilder,
|
||||
// Helper to validate index type compatibility with field data type
|
||||
fn validate_index_type(
|
||||
field: &Field,
|
||||
replace: bool,
|
||||
index_name: &str,
|
||||
supported_fn: impl Fn(&DataType) -> bool,
|
||||
) -> Result<()> {
|
||||
if !supported_vector_data_type(field.data_type()) {
|
||||
return Err(Error::InvalidInput {
|
||||
if !supported_fn(field.data_type()) {
|
||||
return Err(Error::Schema {
|
||||
message: format!(
|
||||
"An IVF Flat index cannot be created on the column `{}` which has data type {}",
|
||||
"A {} index cannot be created on the field `{}` which has data type {}",
|
||||
index_name,
|
||||
field.name(),
|
||||
field.data_type()
|
||||
),
|
||||
});
|
||||
}
|
||||
|
||||
let num_partitions = if let Some(n) = index.num_partitions {
|
||||
n
|
||||
} else {
|
||||
suggested_num_partitions(self.count_rows(None).await?)
|
||||
};
|
||||
let mut dataset = self.dataset.get_mut().await?;
|
||||
let lance_idx_params = lance::index::vector::VectorIndexParams::ivf_flat(
|
||||
num_partitions as usize,
|
||||
index.distance_type.into(),
|
||||
);
|
||||
dataset
|
||||
.create_index(
|
||||
&[field.name()],
|
||||
IndexType::Vector,
|
||||
None,
|
||||
&lance_idx_params,
|
||||
replace,
|
||||
)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn create_ivf_pq_index(
|
||||
// Helper to get num_partitions with default calculation
|
||||
async fn get_num_partitions(
|
||||
&self,
|
||||
index: IvfPqIndexBuilder,
|
||||
field: &Field,
|
||||
replace: bool,
|
||||
) -> Result<()> {
|
||||
if !supported_vector_data_type(field.data_type()) {
|
||||
return Err(Error::InvalidInput {
|
||||
message: format!(
|
||||
"An IVF PQ index cannot be created on the column `{}` which has data type {}",
|
||||
field.name(),
|
||||
field.data_type()
|
||||
),
|
||||
});
|
||||
provided: Option<u32>,
|
||||
for_hnsw: bool,
|
||||
dim: Option<u32>,
|
||||
) -> Result<u32> {
|
||||
if let Some(n) = provided {
|
||||
Ok(n)
|
||||
} else {
|
||||
let row_count = self.count_rows(None).await?;
|
||||
if for_hnsw {
|
||||
Ok(suggested_num_partitions_for_hnsw(
|
||||
row_count,
|
||||
dim.ok_or_else(|| Error::InvalidInput {
|
||||
message: "Vector dimension required for HNSW partitioning".to_string(),
|
||||
})?,
|
||||
))
|
||||
} else {
|
||||
Ok(suggested_num_partitions(row_count))
|
||||
}
|
||||
}
|
||||
|
||||
let num_partitions = if let Some(n) = index.num_partitions {
|
||||
n
|
||||
} else {
|
||||
suggested_num_partitions(self.count_rows(None).await?)
|
||||
};
|
||||
let num_sub_vectors: u32 = if let Some(n) = index.num_sub_vectors {
|
||||
n
|
||||
} else {
|
||||
let dim = infer_vector_dim(field.data_type())?;
|
||||
suggested_num_sub_vectors(dim as u32)
|
||||
};
|
||||
let mut dataset = self.dataset.get_mut().await?;
|
||||
let lance_idx_params = lance::index::vector::VectorIndexParams::ivf_pq(
|
||||
num_partitions as usize,
|
||||
/*num_bits=*/ 8,
|
||||
num_sub_vectors as usize,
|
||||
index.distance_type.into(),
|
||||
index.max_iterations as usize,
|
||||
);
|
||||
dataset
|
||||
.create_index(
|
||||
&[field.name()],
|
||||
IndexType::Vector,
|
||||
None,
|
||||
&lance_idx_params,
|
||||
replace,
|
||||
)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn create_ivf_hnsw_pq_index(
|
||||
&self,
|
||||
index: IvfHnswPqIndexBuilder,
|
||||
field: &Field,
|
||||
replace: bool,
|
||||
) -> Result<()> {
|
||||
if !supported_vector_data_type(field.data_type()) {
|
||||
return Err(Error::InvalidInput {
|
||||
message: format!(
|
||||
"An IVF HNSW PQ index cannot be created on the column `{}` which has data type {}",
|
||||
field.name(),
|
||||
field.data_type()
|
||||
),
|
||||
});
|
||||
// Helper to get num_sub_vectors with default calculation
|
||||
fn get_num_sub_vectors(provided: Option<u32>, dim: u32) -> u32 {
|
||||
provided.unwrap_or_else(|| suggested_num_sub_vectors(dim))
|
||||
}
|
||||
|
||||
// Helper to extract vector dimension from field
|
||||
fn get_vector_dimension(field: &Field) -> Result<u32> {
|
||||
match field.data_type() {
|
||||
arrow_schema::DataType::FixedSizeList(_, n) => Ok(*n as u32),
|
||||
_ => Ok(infer_vector_dim(field.data_type())? as u32),
|
||||
}
|
||||
}
|
||||
|
||||
let num_partitions: u32 = if let Some(n) = index.num_partitions {
|
||||
n
|
||||
} else {
|
||||
match field.data_type() {
|
||||
arrow_schema::DataType::FixedSizeList(_, n) => Ok::<u32, Error>(
|
||||
suggested_num_partitions_for_hnsw(self.count_rows(None).await?, *n as u32),
|
||||
),
|
||||
_ => Err(Error::Schema {
|
||||
message: format!("Column '{}' is not a FixedSizeList", field.name()),
|
||||
}),
|
||||
}?
|
||||
};
|
||||
|
||||
let num_sub_vectors: u32 = if let Some(n) = index.num_sub_vectors {
|
||||
n
|
||||
} else {
|
||||
match field.data_type() {
|
||||
arrow_schema::DataType::FixedSizeList(_, n) => {
|
||||
Ok::<u32, Error>(suggested_num_sub_vectors(*n as u32))
|
||||
// Convert LanceDB Index to Lance IndexParams
|
||||
async fn make_index_params(
|
||||
&self,
|
||||
field: &Field,
|
||||
index_opts: Index,
|
||||
) -> Result<Box<dyn lance::index::IndexParams>> {
|
||||
match index_opts {
|
||||
Index::Auto => {
|
||||
if supported_vector_data_type(field.data_type()) {
|
||||
// Use IvfPq as the default for auto vector indices
|
||||
let dim = Self::get_vector_dimension(field)?;
|
||||
let num_partitions = self.get_num_partitions(None, false, None).await?;
|
||||
let num_sub_vectors = Self::get_num_sub_vectors(None, dim);
|
||||
let lance_idx_params = lance::index::vector::VectorIndexParams::ivf_pq(
|
||||
num_partitions as usize,
|
||||
/*num_bits=*/ 8,
|
||||
num_sub_vectors as usize,
|
||||
lance_linalg::distance::MetricType::L2,
|
||||
/*max_iterations=*/ 50,
|
||||
);
|
||||
Ok(Box::new(lance_idx_params))
|
||||
} else if supported_btree_data_type(field.data_type()) {
|
||||
Ok(Box::new(ScalarIndexParams::new(ScalarIndexType::BTree)))
|
||||
} else {
|
||||
return Err(Error::InvalidInput {
|
||||
message: format!(
|
||||
"there are no indices supported for the field `{}` with the data type {}",
|
||||
field.name(),
|
||||
field.data_type()
|
||||
),
|
||||
});
|
||||
}
|
||||
_ => Err(Error::Schema {
|
||||
message: format!("Column '{}' is not a FixedSizeList", field.name()),
|
||||
}),
|
||||
}?
|
||||
};
|
||||
|
||||
let mut dataset = self.dataset.get_mut().await?;
|
||||
let mut ivf_params = IvfBuildParams::new(num_partitions as usize);
|
||||
ivf_params.sample_rate = index.sample_rate as usize;
|
||||
ivf_params.max_iters = index.max_iterations as usize;
|
||||
let hnsw_params = HnswBuildParams::default()
|
||||
.num_edges(index.m as usize)
|
||||
.ef_construction(index.ef_construction as usize);
|
||||
let pq_params = PQBuildParams {
|
||||
num_sub_vectors: num_sub_vectors as usize,
|
||||
..Default::default()
|
||||
};
|
||||
let lance_idx_params = lance::index::vector::VectorIndexParams::with_ivf_hnsw_pq_params(
|
||||
index.distance_type.into(),
|
||||
ivf_params,
|
||||
hnsw_params,
|
||||
pq_params,
|
||||
);
|
||||
dataset
|
||||
.create_index(
|
||||
&[field.name()],
|
||||
IndexType::Vector,
|
||||
None,
|
||||
&lance_idx_params,
|
||||
replace,
|
||||
)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn create_ivf_hnsw_sq_index(
|
||||
&self,
|
||||
index: IvfHnswSqIndexBuilder,
|
||||
field: &Field,
|
||||
replace: bool,
|
||||
) -> Result<()> {
|
||||
if !supported_vector_data_type(field.data_type()) {
|
||||
return Err(Error::InvalidInput {
|
||||
message: format!(
|
||||
"An IVF HNSW SQ index cannot be created on the column `{}` which has data type {}",
|
||||
field.name(),
|
||||
field.data_type()
|
||||
),
|
||||
});
|
||||
}
|
||||
|
||||
let num_partitions: u32 = if let Some(n) = index.num_partitions {
|
||||
n
|
||||
} else {
|
||||
match field.data_type() {
|
||||
arrow_schema::DataType::FixedSizeList(_, n) => Ok::<u32, Error>(
|
||||
suggested_num_partitions_for_hnsw(self.count_rows(None).await?, *n as u32),
|
||||
),
|
||||
_ => Err(Error::Schema {
|
||||
message: format!("Column '{}' is not a FixedSizeList", field.name()),
|
||||
}),
|
||||
}?
|
||||
};
|
||||
|
||||
let mut dataset = self.dataset.get_mut().await?;
|
||||
let mut ivf_params = IvfBuildParams::new(num_partitions as usize);
|
||||
ivf_params.sample_rate = index.sample_rate as usize;
|
||||
ivf_params.max_iters = index.max_iterations as usize;
|
||||
let hnsw_params = HnswBuildParams::default()
|
||||
.num_edges(index.m as usize)
|
||||
.ef_construction(index.ef_construction as usize);
|
||||
let sq_params = SQBuildParams {
|
||||
sample_rate: index.sample_rate as usize,
|
||||
..Default::default()
|
||||
};
|
||||
let lance_idx_params = lance::index::vector::VectorIndexParams::with_ivf_hnsw_sq_params(
|
||||
index.distance_type.into(),
|
||||
ivf_params,
|
||||
hnsw_params,
|
||||
sq_params,
|
||||
);
|
||||
dataset
|
||||
.create_index(
|
||||
&[field.name()],
|
||||
IndexType::Vector,
|
||||
None,
|
||||
&lance_idx_params,
|
||||
replace,
|
||||
)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn create_auto_index(&self, field: &Field, opts: IndexBuilder) -> Result<()> {
|
||||
if supported_vector_data_type(field.data_type()) {
|
||||
self.create_ivf_pq_index(IvfPqIndexBuilder::default(), field, opts.replace)
|
||||
.await
|
||||
} else if supported_btree_data_type(field.data_type()) {
|
||||
self.create_btree_index(field, opts).await
|
||||
} else {
|
||||
Err(Error::InvalidInput {
|
||||
message: format!(
|
||||
"there are no indices supported for the field `{}` with the data type {}",
|
||||
field.name(),
|
||||
field.data_type()
|
||||
),
|
||||
})
|
||||
}
|
||||
Index::BTree(_) => {
|
||||
Self::validate_index_type(field, "BTree", supported_btree_data_type)?;
|
||||
Ok(Box::new(ScalarIndexParams::new(ScalarIndexType::BTree)))
|
||||
}
|
||||
Index::Bitmap(_) => {
|
||||
Self::validate_index_type(field, "Bitmap", supported_bitmap_data_type)?;
|
||||
Ok(Box::new(ScalarIndexParams::new(ScalarIndexType::Bitmap)))
|
||||
}
|
||||
Index::LabelList(_) => {
|
||||
Self::validate_index_type(field, "LabelList", supported_label_list_data_type)?;
|
||||
Ok(Box::new(ScalarIndexParams::new(ScalarIndexType::LabelList)))
|
||||
}
|
||||
Index::FTS(fts_opts) => {
|
||||
Self::validate_index_type(field, "FTS", supported_fts_data_type)?;
|
||||
Ok(Box::new(fts_opts))
|
||||
}
|
||||
Index::IvfFlat(index) => {
|
||||
Self::validate_index_type(field, "IVF Flat", supported_vector_data_type)?;
|
||||
let num_partitions = self
|
||||
.get_num_partitions(index.num_partitions, false, None)
|
||||
.await?;
|
||||
let lance_idx_params = VectorIndexParams::ivf_flat(
|
||||
num_partitions as usize,
|
||||
index.distance_type.into(),
|
||||
);
|
||||
Ok(Box::new(lance_idx_params))
|
||||
}
|
||||
Index::IvfPq(index) => {
|
||||
Self::validate_index_type(field, "IVF PQ", supported_vector_data_type)?;
|
||||
let dim = Self::get_vector_dimension(field)?;
|
||||
let num_partitions = self
|
||||
.get_num_partitions(index.num_partitions, false, None)
|
||||
.await?;
|
||||
let num_sub_vectors = Self::get_num_sub_vectors(index.num_sub_vectors, dim);
|
||||
let lance_idx_params = VectorIndexParams::ivf_pq(
|
||||
num_partitions as usize,
|
||||
/*num_bits=*/ 8,
|
||||
num_sub_vectors as usize,
|
||||
index.distance_type.into(),
|
||||
index.max_iterations as usize,
|
||||
);
|
||||
Ok(Box::new(lance_idx_params))
|
||||
}
|
||||
Index::IvfHnswPq(index) => {
|
||||
Self::validate_index_type(field, "IVF HNSW PQ", supported_vector_data_type)?;
|
||||
let dim = Self::get_vector_dimension(field)?;
|
||||
let num_partitions = self
|
||||
.get_num_partitions(index.num_partitions, true, Some(dim))
|
||||
.await?;
|
||||
let num_sub_vectors = Self::get_num_sub_vectors(index.num_sub_vectors, dim);
|
||||
let mut ivf_params = IvfBuildParams::new(num_partitions as usize);
|
||||
ivf_params.sample_rate = index.sample_rate as usize;
|
||||
ivf_params.max_iters = index.max_iterations as usize;
|
||||
let hnsw_params = HnswBuildParams::default()
|
||||
.num_edges(index.m as usize)
|
||||
.ef_construction(index.ef_construction as usize);
|
||||
let pq_params = PQBuildParams {
|
||||
num_sub_vectors: num_sub_vectors as usize,
|
||||
..Default::default()
|
||||
};
|
||||
let lance_idx_params = VectorIndexParams::with_ivf_hnsw_pq_params(
|
||||
index.distance_type.into(),
|
||||
ivf_params,
|
||||
hnsw_params,
|
||||
pq_params,
|
||||
);
|
||||
Ok(Box::new(lance_idx_params))
|
||||
}
|
||||
Index::IvfHnswSq(index) => {
|
||||
Self::validate_index_type(field, "IVF HNSW SQ", supported_vector_data_type)?;
|
||||
let dim = Self::get_vector_dimension(field)?;
|
||||
let num_partitions = self
|
||||
.get_num_partitions(index.num_partitions, true, Some(dim))
|
||||
.await?;
|
||||
let mut ivf_params = IvfBuildParams::new(num_partitions as usize);
|
||||
ivf_params.sample_rate = index.sample_rate as usize;
|
||||
ivf_params.max_iters = index.max_iterations as usize;
|
||||
let hnsw_params = HnswBuildParams::default()
|
||||
.num_edges(index.m as usize)
|
||||
.ef_construction(index.ef_construction as usize);
|
||||
let sq_params = SQBuildParams {
|
||||
sample_rate: index.sample_rate as usize,
|
||||
..Default::default()
|
||||
};
|
||||
let lance_idx_params = VectorIndexParams::with_ivf_hnsw_sq_params(
|
||||
index.distance_type.into(),
|
||||
ivf_params,
|
||||
hnsw_params,
|
||||
sq_params,
|
||||
);
|
||||
Ok(Box::new(lance_idx_params))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn create_btree_index(&self, field: &Field, opts: IndexBuilder) -> Result<()> {
|
||||
if !supported_btree_data_type(field.data_type()) {
|
||||
return Err(Error::Schema {
|
||||
message: format!(
|
||||
"A BTree index cannot be created on the field `{}` which has data type {}",
|
||||
field.name(),
|
||||
field.data_type()
|
||||
),
|
||||
});
|
||||
// Helper method to get the correct IndexType based on the Index variant and field data type
|
||||
fn get_index_type_for_field(&self, field: &Field, index: &Index) -> IndexType {
|
||||
match index {
|
||||
Index::Auto => {
|
||||
if supported_vector_data_type(field.data_type()) {
|
||||
IndexType::Vector
|
||||
} else if supported_btree_data_type(field.data_type()) {
|
||||
IndexType::BTree
|
||||
} else {
|
||||
// This should not happen since make_index_params would have failed
|
||||
IndexType::BTree
|
||||
}
|
||||
}
|
||||
Index::BTree(_) => IndexType::BTree,
|
||||
Index::Bitmap(_) => IndexType::Bitmap,
|
||||
Index::LabelList(_) => IndexType::LabelList,
|
||||
Index::FTS(_) => IndexType::Inverted,
|
||||
Index::IvfFlat(_) | Index::IvfPq(_) | Index::IvfHnswPq(_) | Index::IvfHnswSq(_) => {
|
||||
IndexType::Vector
|
||||
}
|
||||
}
|
||||
|
||||
let mut dataset = self.dataset.get_mut().await?;
|
||||
let lance_idx_params = lance_index::scalar::ScalarIndexParams {
|
||||
force_index_type: Some(lance_index::scalar::ScalarIndexType::BTree),
|
||||
};
|
||||
dataset
|
||||
.create_index(
|
||||
&[field.name()],
|
||||
IndexType::BTree,
|
||||
None,
|
||||
&lance_idx_params,
|
||||
opts.replace,
|
||||
)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn create_bitmap_index(&self, field: &Field, opts: IndexBuilder) -> Result<()> {
|
||||
if !supported_bitmap_data_type(field.data_type()) {
|
||||
return Err(Error::Schema {
|
||||
message: format!(
|
||||
"A Bitmap index cannot be created on the field `{}` which has data type {}",
|
||||
field.name(),
|
||||
field.data_type()
|
||||
),
|
||||
});
|
||||
}
|
||||
|
||||
let mut dataset = self.dataset.get_mut().await?;
|
||||
let lance_idx_params = lance_index::scalar::ScalarIndexParams {
|
||||
force_index_type: Some(lance_index::scalar::ScalarIndexType::Bitmap),
|
||||
};
|
||||
dataset
|
||||
.create_index(
|
||||
&[field.name()],
|
||||
IndexType::Bitmap,
|
||||
None,
|
||||
&lance_idx_params,
|
||||
opts.replace,
|
||||
)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn create_label_list_index(&self, field: &Field, opts: IndexBuilder) -> Result<()> {
|
||||
if !supported_label_list_data_type(field.data_type()) {
|
||||
return Err(Error::Schema {
|
||||
message: format!(
|
||||
"A LabelList index cannot be created on the field `{}` which has data type {}",
|
||||
field.name(),
|
||||
field.data_type()
|
||||
),
|
||||
});
|
||||
}
|
||||
|
||||
let mut dataset = self.dataset.get_mut().await?;
|
||||
let lance_idx_params = lance_index::scalar::ScalarIndexParams {
|
||||
force_index_type: Some(lance_index::scalar::ScalarIndexType::LabelList),
|
||||
};
|
||||
dataset
|
||||
.create_index(
|
||||
&[field.name()],
|
||||
IndexType::LabelList,
|
||||
None,
|
||||
&lance_idx_params,
|
||||
opts.replace,
|
||||
)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn create_fts_index(
|
||||
&self,
|
||||
field: &Field,
|
||||
fts_opts: FtsIndexBuilder,
|
||||
replace: bool,
|
||||
) -> Result<()> {
|
||||
if !supported_fts_data_type(field.data_type()) {
|
||||
return Err(Error::Schema {
|
||||
message: format!(
|
||||
"A FTS index cannot be created on the field `{}` which has data type {}",
|
||||
field.name(),
|
||||
field.data_type()
|
||||
),
|
||||
});
|
||||
}
|
||||
|
||||
let mut dataset = self.dataset.get_mut().await?;
|
||||
dataset
|
||||
.create_index(
|
||||
&[field.name()],
|
||||
IndexType::Inverted,
|
||||
None,
|
||||
&fts_opts,
|
||||
replace,
|
||||
)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn generic_query(
|
||||
@@ -2251,26 +2115,20 @@ impl BaseTable for NativeTable {
|
||||
|
||||
let field = schema.field_with_name(&opts.columns[0])?;
|
||||
|
||||
match opts.index {
|
||||
Index::Auto => self.create_auto_index(field, opts).await,
|
||||
Index::BTree(_) => self.create_btree_index(field, opts).await,
|
||||
Index::Bitmap(_) => self.create_bitmap_index(field, opts).await,
|
||||
Index::LabelList(_) => self.create_label_list_index(field, opts).await,
|
||||
Index::FTS(fts_opts) => self.create_fts_index(field, fts_opts, opts.replace).await,
|
||||
Index::IvfFlat(ivf_flat) => {
|
||||
self.create_ivf_flat_index(ivf_flat, field, opts.replace)
|
||||
.await
|
||||
}
|
||||
Index::IvfPq(ivf_pq) => self.create_ivf_pq_index(ivf_pq, field, opts.replace).await,
|
||||
Index::IvfHnswPq(ivf_hnsw_pq) => {
|
||||
self.create_ivf_hnsw_pq_index(ivf_hnsw_pq, field, opts.replace)
|
||||
.await
|
||||
}
|
||||
Index::IvfHnswSq(ivf_hnsw_sq) => {
|
||||
self.create_ivf_hnsw_sq_index(ivf_hnsw_sq, field, opts.replace)
|
||||
.await
|
||||
}
|
||||
let lance_idx_params = self.make_index_params(field, opts.index.clone()).await?;
|
||||
let index_type = self.get_index_type_for_field(field, &opts.index);
|
||||
let columns = [field.name().as_str()];
|
||||
let mut dataset = self.dataset.get_mut().await?;
|
||||
let mut builder = dataset
|
||||
.create_index_builder(&columns, index_type, lance_idx_params.as_ref())
|
||||
.train(opts.train)
|
||||
.replace(opts.replace);
|
||||
|
||||
if let Some(name) = opts.name {
|
||||
builder = builder.name(name);
|
||||
}
|
||||
builder.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn drop_index(&self, index_name: &str) -> Result<()> {
|
||||
@@ -2890,6 +2748,7 @@ mod tests {
|
||||
use crate::connect;
|
||||
use crate::connection::ConnectBuilder;
|
||||
use crate::index::scalar::{BTreeIndexBuilder, BitmapIndexBuilder};
|
||||
use crate::index::vector::{IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder};
|
||||
use crate::query::{ExecutableQuery, QueryBase};
|
||||
|
||||
#[tokio::test]
|
||||
|
||||
Reference in New Issue
Block a user