feat: add new table API to wait for async indexing (#2338)

* Add new wait_for_index() table operation that polls until indices are
created/fully indexed
* Add an optional wait timeout parameter to all create_index operations
* Python and NodeJS interfaces

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

## Summary by CodeRabbit

- **New Features**
- Added optional waiting for index creation completion with configurable
timeout.
- Introduced methods to poll and wait for indices to be fully built
across sync and async tables.
  - Extended index creation APIs to accept a wait timeout parameter.
- **Bug Fixes**
- Added a new timeout error variant for improved error reporting on
index operations.
- **Tests**
- Added tests covering successful index readiness waiting, timeout
scenarios, and missing index cases.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
Ryan Green
2025-04-21 08:41:21 -02:30
committed by GitHub
parent 4f07fea6df
commit 3ae90dde80
16 changed files with 582 additions and 33 deletions

View File

@@ -104,6 +104,7 @@ class RemoteTable(Table):
index_type: Literal["BTREE", "BITMAP", "LABEL_LIST", "scalar"] = "scalar",
*,
replace: bool = False,
wait_timeout: timedelta = None,
):
"""Creates a scalar index
Parameters
@@ -126,13 +127,18 @@ class RemoteTable(Table):
else:
raise ValueError(f"Unknown index type: {index_type}")
LOOP.run(self._table.create_index(column, config=config, replace=replace))
LOOP.run(
self._table.create_index(
column, config=config, replace=replace, wait_timeout=wait_timeout
)
)
def create_fts_index(
self,
column: str,
*,
replace: bool = False,
wait_timeout: timedelta = None,
with_position: bool = True,
# tokenizer configs:
base_tokenizer: str = "simple",
@@ -153,7 +159,11 @@ class RemoteTable(Table):
remove_stop_words=remove_stop_words,
ascii_folding=ascii_folding,
)
LOOP.run(self._table.create_index(column, config=config, replace=replace))
LOOP.run(
self._table.create_index(
column, config=config, replace=replace, wait_timeout=wait_timeout
)
)
def create_index(
self,
@@ -165,6 +175,7 @@ class RemoteTable(Table):
replace: Optional[bool] = None,
accelerator: Optional[str] = None,
index_type="vector",
wait_timeout: Optional[timedelta] = None,
):
"""Create an index on the table.
Currently, the only parameters that matter are
@@ -236,7 +247,11 @@ class RemoteTable(Table):
" 'IVF_FLAT', 'IVF_PQ', 'IVF_HNSW_PQ', 'IVF_HNSW_SQ'"
)
LOOP.run(self._table.create_index(vector_column_name, config=config))
LOOP.run(
self._table.create_index(
vector_column_name, config=config, wait_timeout=wait_timeout
)
)
def add(
self,
@@ -554,6 +569,11 @@ class RemoteTable(Table):
def drop_index(self, index_name: str):
return LOOP.run(self._table.drop_index(index_name))
def wait_for_index(
self, index_names: Iterable[str], timeout: timedelta = timedelta(seconds=300)
):
return LOOP.run(self._table.wait_for_index(index_names, timeout))
def uses_v2_manifest_paths(self) -> bool:
raise NotImplementedError(
"uses_v2_manifest_paths() is not supported on the LanceDB Cloud"

View File

@@ -631,6 +631,7 @@ class Table(ABC):
index_cache_size: Optional[int] = None,
*,
index_type: VectorIndexType = "IVF_PQ",
wait_timeout: Optional[timedelta] = None,
num_bits: int = 8,
max_iterations: int = 50,
sample_rate: int = 256,
@@ -666,6 +667,8 @@ class Table(ABC):
num_bits: int
The number of bits to encode sub-vectors. Only used with the IVF_PQ index.
Only 4 and 8 are supported.
wait_timeout: timedelta, optional
The timeout to wait if indexing is asynchronous.
"""
raise NotImplementedError
@@ -689,6 +692,23 @@ class Table(ABC):
"""
raise NotImplementedError
def wait_for_index(
self, index_names: Iterable[str], timeout: timedelta = timedelta(seconds=300)
) -> None:
"""
Wait for indexing to complete for the given index names.
This will poll the table until all the indices are fully indexed,
or raise a timeout exception if the timeout is reached.
Parameters
----------
index_names: str
The name of the indices to poll
timeout: timedelta
Timeout to wait for asynchronous indexing. The default is 5 minutes.
"""
raise NotImplementedError
@abstractmethod
def create_scalar_index(
self,
@@ -696,6 +716,7 @@ class Table(ABC):
*,
replace: bool = True,
index_type: ScalarIndexType = "BTREE",
wait_timeout: Optional[timedelta] = None,
):
"""Create a scalar index on a column.
@@ -708,7 +729,8 @@ class Table(ABC):
Replace the existing index if it exists.
index_type: Literal["BTREE", "BITMAP", "LABEL_LIST"], default "BTREE"
The type of index to create.
wait_timeout: timedelta, optional
The timeout to wait if indexing is asynchronous.
Examples
--------
@@ -767,6 +789,7 @@ class Table(ABC):
stem: bool = False,
remove_stop_words: bool = False,
ascii_folding: bool = False,
wait_timeout: Optional[timedelta] = None,
):
"""Create a full-text search index on the table.
@@ -822,6 +845,8 @@ class Table(ABC):
ascii_folding : bool, default False
Whether to fold ASCII characters. This converts accented characters to
their ASCII equivalent. For example, "café" would be converted to "cafe".
wait_timeout: timedelta, optional
The timeout to wait if indexing is asynchronous.
"""
raise NotImplementedError
@@ -1771,6 +1796,11 @@ class LanceTable(Table):
"""
return LOOP.run(self._table.prewarm_index(name))
def wait_for_index(
self, index_names: Iterable[str], timeout: timedelta = timedelta(seconds=300)
) -> None:
return LOOP.run(self._table.wait_for_index(index_names, timeout))
def create_scalar_index(
self,
column: str,
@@ -2964,6 +2994,7 @@ class AsyncTable:
config: Optional[
Union[IvfFlat, IvfPq, HnswPq, HnswSq, BTree, Bitmap, LabelList, FTS]
] = None,
wait_timeout: Optional[timedelta] = None,
):
"""Create an index to speed up queries
@@ -2988,6 +3019,8 @@ class AsyncTable:
For advanced configuration you can specify the type of index you would
like to create. You can also specify index-specific parameters when
creating an index object.
wait_timeout: timedelta, optional
The timeout to wait if indexing is asynchronous.
"""
if config is not None:
if not isinstance(
@@ -2998,7 +3031,9 @@ class AsyncTable:
" Bitmap, LabelList, or FTS"
)
try:
await self._inner.create_index(column, index=config, replace=replace)
await self._inner.create_index(
column, index=config, replace=replace, wait_timeout=wait_timeout
)
except ValueError as e:
if "not support the requested language" in str(e):
supported_langs = ", ".join(lang_mapping.values())
@@ -3043,6 +3078,23 @@ class AsyncTable:
"""
await self._inner.prewarm_index(name)
async def wait_for_index(
self, index_names: Iterable[str], timeout: timedelta = timedelta(seconds=300)
) -> None:
"""
Wait for indexing to complete for the given index names.
This will poll the table until all the indices are fully indexed,
or raise a timeout exception if the timeout is reached.
Parameters
----------
index_names: str
The name of the indices to poll
timeout: timedelta
Timeout to wait for asynchronous indexing. The default is 5 minutes.
"""
await self._inner.wait_for_index(index_names, timeout)
async def add(
self,
data: DATA,

View File

@@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
import re
from concurrent.futures import ThreadPoolExecutor
import contextlib
from datetime import timedelta
@@ -235,6 +235,10 @@ def test_table_add_in_threadpool():
def test_table_create_indices():
def handler(request):
index_stats = dict(
index_type="IVF_PQ", num_indexed_rows=1000, num_unindexed_rows=0
)
if request.path == "/v1/table/test/create_index/":
request.send_response(200)
request.end_headers()
@@ -258,6 +262,47 @@ def test_table_create_indices():
)
)
request.wfile.write(payload.encode())
elif request.path == "/v1/table/test/index/list/":
request.send_response(200)
request.send_header("Content-Type", "application/json")
request.end_headers()
payload = json.dumps(
dict(
indexes=[
{
"index_name": "id_idx",
"columns": ["id"],
},
{
"index_name": "text_idx",
"columns": ["text"],
},
{
"index_name": "vector_idx",
"columns": ["vector"],
},
]
)
)
request.wfile.write(payload.encode())
elif request.path == "/v1/table/test/index/id_idx/stats/":
request.send_response(200)
request.send_header("Content-Type", "application/json")
request.end_headers()
payload = json.dumps(index_stats)
request.wfile.write(payload.encode())
elif request.path == "/v1/table/test/index/text_idx/stats/":
request.send_response(200)
request.send_header("Content-Type", "application/json")
request.end_headers()
payload = json.dumps(index_stats)
request.wfile.write(payload.encode())
elif request.path == "/v1/table/test/index/vector_idx/stats/":
request.send_response(200)
request.send_header("Content-Type", "application/json")
request.end_headers()
payload = json.dumps(index_stats)
request.wfile.write(payload.encode())
elif "/drop/" in request.path:
request.send_response(200)
request.end_headers()
@@ -269,14 +314,81 @@ def test_table_create_indices():
# Parameters are well-tested through local and async tests.
# This is a smoke-test.
table = db.create_table("test", [{"id": 1}])
table.create_scalar_index("id")
table.create_fts_index("text")
table.create_scalar_index("vector")
table.create_scalar_index("id", wait_timeout=timedelta(seconds=2))
table.create_fts_index("text", wait_timeout=timedelta(seconds=2))
table.create_index(
vector_column_name="vector", wait_timeout=timedelta(seconds=10)
)
table.wait_for_index(["id_idx"], timedelta(seconds=2))
table.wait_for_index(["text_idx", "vector_idx"], timedelta(seconds=2))
table.drop_index("vector_idx")
table.drop_index("id_idx")
table.drop_index("text_idx")
def test_table_wait_for_index_timeout():
def handler(request):
index_stats = dict(
index_type="BTREE", num_indexed_rows=1000, num_unindexed_rows=1
)
if request.path == "/v1/table/test/create/?mode=create":
request.send_response(200)
request.send_header("Content-Type", "application/json")
request.end_headers()
request.wfile.write(b"{}")
elif request.path == "/v1/table/test/describe/":
request.send_response(200)
request.send_header("Content-Type", "application/json")
request.end_headers()
payload = json.dumps(
dict(
version=1,
schema=dict(
fields=[
dict(name="id", type={"type": "int64"}, nullable=False),
]
),
)
)
request.wfile.write(payload.encode())
elif request.path == "/v1/table/test/index/list/":
request.send_response(200)
request.send_header("Content-Type", "application/json")
request.end_headers()
payload = json.dumps(
dict(
indexes=[
{
"index_name": "id_idx",
"columns": ["id"],
},
]
)
)
request.wfile.write(payload.encode())
elif request.path == "/v1/table/test/index/id_idx/stats/":
request.send_response(200)
request.send_header("Content-Type", "application/json")
request.end_headers()
payload = json.dumps(index_stats)
print(f"{index_stats=}")
request.wfile.write(payload.encode())
else:
request.send_response(404)
request.end_headers()
with mock_lancedb_connection(handler) as db:
table = db.create_table("test", [{"id": 1}])
with pytest.raises(
RuntimeError,
match=re.escape(
'Timeout error: timed out waiting for indices: ["id_idx"] after 1s'
),
):
table.wait_for_index(["id_idx"], timedelta(seconds=1))
@contextlib.contextmanager
def query_test_table(query_handler, *, server_version=Version("0.1.0")):
def handler(request):

View File

@@ -177,15 +177,19 @@ impl Table {
})
}
#[pyo3(signature = (column, index=None, replace=None))]
#[pyo3(signature = (column, index=None, replace=None, wait_timeout=None))]
pub fn create_index<'a>(
self_: PyRef<'a, Self>,
column: String,
index: Option<Bound<'_, PyAny>>,
replace: Option<bool>,
wait_timeout: Option<Bound<'_, PyAny>>,
) -> PyResult<Bound<'a, PyAny>> {
let index = extract_index_params(&index)?;
let mut op = self_.inner_ref()?.create_index(&[column], index);
let timeout = wait_timeout.map(|t| t.extract::<std::time::Duration>().unwrap());
let mut op = self_
.inner_ref()?
.create_index_with_timeout(&[column], index, timeout);
if let Some(replace) = replace {
op = op.replace(replace);
}
@@ -204,6 +208,26 @@ impl Table {
})
}
pub fn wait_for_index<'a>(
self_: PyRef<'a, Self>,
index_names: Vec<String>,
timeout: Bound<'_, PyAny>,
) -> PyResult<Bound<'a, PyAny>> {
let inner = self_.inner_ref()?.clone();
let timeout = timeout.extract::<std::time::Duration>()?;
future_into_py(self_.py(), async move {
let index_refs = index_names
.iter()
.map(String::as_str)
.collect::<Vec<&str>>();
inner
.wait_for_index(&index_refs, timeout)
.await
.infer_error()?;
Ok(())
})
}
pub fn prewarm_index(self_: PyRef<'_, Self>, index_name: String) -> PyResult<Bound<'_, PyAny>> {
let inner = self_.inner_ref()?.clone();
future_into_py(self_.py(), async move {