feat: allow setting train=False and name on indices (#2586)

Enables two new parameters when building indices:

* `name`: Allows explicitly setting a name on the index. Default is
`{col_name}_idx`.
* `train` (default `True`): When set to `False`, the index is created
immediately as an empty index, without being trained on existing data.

Upgrading Lance also brings in additional behavior changes from
cd76a993b8:

* When a scalar index is created on a Table, it will be kept around even
if all rows are deleted or updated.
* Scalar indices can be created on empty tables. They will default to
`train=False` if the table is empty.

---------

Co-authored-by: Weston Pace <weston.pace@gmail.com>
This commit is contained in:
Will Jones
2025-08-15 14:00:26 -07:00
committed by GitHub
parent 0c34ffb252
commit ad09234d59
14 changed files with 620 additions and 353 deletions

View File

@@ -59,6 +59,10 @@ class Table:
column: str,
index: Union[IvfFlat, IvfPq, HnswPq, HnswSq, BTree, Bitmap, LabelList, FTS],
replace: Optional[bool],
wait_timeout: Optional[object],
*,
name: Optional[str],
train: Optional[bool],
): ...
async def list_versions(self) -> List[Dict[str, Any]]: ...
async def version(self) -> int: ...

View File

@@ -194,6 +194,8 @@ class RemoteTable(Table):
wait_timeout: Optional[timedelta] = None,
*,
num_bits: int = 8,
name: Optional[str] = None,
train: bool = True,
):
"""Create an index on the table.
Currently, the only parameters that matter are
@@ -270,7 +272,11 @@ class RemoteTable(Table):
LOOP.run(
self._table.create_index(
vector_column_name, config=config, wait_timeout=wait_timeout
vector_column_name,
config=config,
wait_timeout=wait_timeout,
name=name,
train=train,
)
)

View File

@@ -689,6 +689,8 @@ class Table(ABC):
sample_rate: int = 256,
m: int = 20,
ef_construction: int = 300,
name: Optional[str] = None,
train: bool = True,
):
"""Create an index on the table.
@@ -721,6 +723,11 @@ class Table(ABC):
Only 4 and 8 are supported.
wait_timeout: timedelta, optional
The timeout to wait if indexing is asynchronous.
name: str, optional
The name of the index. If not provided, a default name will be generated.
train: bool, default True
Whether to train the index with existing data. Vector indices always train
with existing data.
"""
raise NotImplementedError
@@ -1929,6 +1936,9 @@ class LanceTable(Table):
sample_rate: int = 256,
m: int = 20,
ef_construction: int = 300,
*,
name: Optional[str] = None,
train: bool = True,
):
"""Create an index on the table."""
if accelerator is not None:
@@ -1992,6 +2002,8 @@ class LanceTable(Table):
vector_column_name,
replace=replace,
config=config,
name=name,
train=train,
)
)
@@ -3251,6 +3263,8 @@ class AsyncTable:
Union[IvfFlat, IvfPq, HnswPq, HnswSq, BTree, Bitmap, LabelList, FTS]
] = None,
wait_timeout: Optional[timedelta] = None,
name: Optional[str] = None,
train: bool = True,
):
"""Create an index to speed up queries
@@ -3277,6 +3291,11 @@ class AsyncTable:
creating an index object.
wait_timeout: timedelta, optional
The timeout to wait if indexing is asynchronous.
name: str, optional
The name of the index. If not provided, a default name will be generated.
train: bool, default True
Whether to train the index with existing data. Vector indices always train
with existing data.
"""
if config is not None:
if not isinstance(
@@ -3288,7 +3307,12 @@ class AsyncTable:
)
try:
await self._inner.create_index(
column, index=config, replace=replace, wait_timeout=wait_timeout
column,
index=config,
replace=replace,
wait_timeout=wait_timeout,
name=name,
train=train,
)
except ValueError as e:
if "not support the requested language" in str(e):

View File

@@ -670,7 +670,9 @@ def test_create_index_method(mock_create_index, mem_db: DBConnection):
num_sub_vectors=96,
num_bits=4,
)
mock_create_index.assert_called_with("vector", replace=True, config=expected_config)
mock_create_index.assert_called_with(
"vector", replace=True, config=expected_config, name=None, train=True
)
table.create_index(
vector_column_name="my_vector",
@@ -680,7 +682,7 @@ def test_create_index_method(mock_create_index, mem_db: DBConnection):
)
expected_config = HnswPq(distance_type="dot")
mock_create_index.assert_called_with(
"my_vector", replace=False, config=expected_config
"my_vector", replace=False, config=expected_config, name=None, train=True
)
table.create_index(
@@ -695,7 +697,44 @@ def test_create_index_method(mock_create_index, mem_db: DBConnection):
distance_type="cosine", sample_rate=0.1, m=29, ef_construction=10
)
mock_create_index.assert_called_with(
"my_vector", replace=True, config=expected_config
"my_vector", replace=True, config=expected_config, name=None, train=True
)
@patch("lancedb.table.AsyncTable.create_index")
def test_create_index_name_and_train_parameters(
    mock_create_index, mem_db: DBConnection
):
    """Test that name and train parameters are passed correctly to AsyncTable.

    ``AsyncTable.create_index`` is patched, so each case only inspects the
    keyword arguments of the most recent call made through the sync table.
    """
    table = mem_db.create_table(
        "test",
        data=[
            {"vector": [3.1, 4.1], "id": 1},
            {"vector": [5.9, 26.5], "id": 2},
        ],
    )
    expected_config = IvfPq()  # Default config

    # Custom name only: train is expected to be forwarded as True.
    table.create_index(vector_column_name="vector", name="my_custom_index")
    mock_create_index.assert_called_with(
        "vector",
        replace=True,
        config=expected_config,
        name="my_custom_index",
        train=True,
    )

    # train=False only: name is expected to be forwarded as None.
    table.create_index(vector_column_name="vector", train=False)
    mock_create_index.assert_called_with(
        "vector",
        replace=True,
        config=expected_config,
        name=None,
        train=False,
    )

    # Both set to non-default values at once. train=False is used here because
    # train=True could not be told apart from the name-only case above.
    table.create_index(
        vector_column_name="vector", name="my_index_name", train=False
    )
    mock_create_index.assert_called_with(
        "vector",
        replace=True,
        config=expected_config,
        name="my_index_name",
        train=False,
    )

View File

@@ -341,13 +341,15 @@ impl Table {
})
}
#[pyo3(signature = (column, index=None, replace=None, wait_timeout=None))]
#[pyo3(signature = (column, index=None, replace=None, wait_timeout=None, *, name=None, train=None))]
pub fn create_index<'a>(
self_: PyRef<'a, Self>,
column: String,
index: Option<Bound<'_, PyAny>>,
replace: Option<bool>,
wait_timeout: Option<Bound<'_, PyAny>>,
name: Option<String>,
train: Option<bool>,
) -> PyResult<Bound<'a, PyAny>> {
let index = extract_index_params(&index)?;
let timeout = wait_timeout.map(|t| t.extract::<std::time::Duration>().unwrap());
@@ -357,6 +359,12 @@ impl Table {
if let Some(replace) = replace {
op = op.replace(replace);
}
if let Some(name) = name {
op = op.name(name);
}
if let Some(train) = train {
op = op.train(train);
}
future_into_py(self_.py(), async move {
op.execute().await.infer_error()?;