From f951da2b00099360122dba67734cccae30e30337 Mon Sep 17 00:00:00 2001 From: Esteban Gutierrez Date: Tue, 10 Mar 2026 15:39:39 -0500 Subject: [PATCH] feat: support prewarm_index and prewarm_data on remote tables (#3110) ## Summary - Implement `RemoteTable.prewarm_data(columns)` calling `POST /v1/table/{id}/page_cache/prewarm/` - Implement `RemoteTable.prewarm_index(name)` calling `POST /v1/table/{id}/index/{name}/prewarm/` (previously returned `NotSupported`) - Add `BaseTable::prewarm_data(columns)` trait method and `Table` public API in Rust core - Add PyO3 bindings and Python API (`AsyncTable`, `LanceTable`, `RemoteTable`) for `prewarm_data` - Add type stubs for `prewarm_index` and `prewarm_data` in `_lancedb.pyi` - Upgrade Lance to 3.0.0-rc.3 with breaking change fixes Co-authored-by: Will Jones Co-authored-by: Claude Opus 4.6 --- Cargo.lock | 12 ++++ python/python/lancedb/_lancedb.pyi | 2 + python/python/lancedb/remote/table.py | 39 ++++++++++++ python/python/lancedb/table.py | 77 ++++++++++++++++++++--- python/src/table.rs | 11 ++++ rust/lancedb/src/remote/client.rs | 9 +-- rust/lancedb/src/remote/table.rs | 89 +++++++++++++++++++++++++-- rust/lancedb/src/table.rs | 50 ++++++++++++--- 8 files changed, 261 insertions(+), 28 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a06b43ac7..899488e78 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7753,6 +7753,18 @@ dependencies = [ "syn 2.0.114", ] +[[package]] +name = "snafu-derive" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54254b8531cafa275c5e096f62d48c81435d1015405a91198ddb11e967301d40" +dependencies = [ + "heck 0.4.1", + "proc-macro2", + "quote", + "syn 2.0.114", +] + [[package]] name = "socket2" version = "0.5.10" diff --git a/python/python/lancedb/_lancedb.pyi b/python/python/lancedb/_lancedb.pyi index 60a4aae60..c5b35c945 100644 --- a/python/python/lancedb/_lancedb.pyi +++ b/python/python/lancedb/_lancedb.pyi @@ -166,6 +166,8 @@ class Table: async def 
checkout(self, version: Union[int, str]): ... async def checkout_latest(self): ... async def restore(self, version: Optional[Union[int, str]] = None): ... + async def prewarm_index(self, index_name: str) -> None: ... + async def prewarm_data(self, columns: Optional[List[str]] = None) -> None: ... async def list_indices(self) -> list[IndexConfig]: ... async def delete(self, filter: str) -> DeleteResult: ... async def add_columns(self, columns: list[tuple[str, str]]) -> AddColumnsResult: ... diff --git a/python/python/lancedb/remote/table.py b/python/python/lancedb/remote/table.py index 84a067771..4dd5b428f 100644 --- a/python/python/lancedb/remote/table.py +++ b/python/python/lancedb/remote/table.py @@ -640,6 +640,45 @@ class RemoteTable(Table): def drop_index(self, index_name: str): return LOOP.run(self._table.drop_index(index_name)) + def prewarm_index(self, name: str) -> None: + """Prewarm an index in the table. + + This is a hint to the database that the index will be accessed in the + future and should be loaded into memory if possible. This can reduce + cold-start latency for subsequent queries. + + This call initiates prewarming and returns once the request is accepted. + It is idempotent and safe to call from multiple clients concurrently. + + Parameters + ---------- + name: str + The name of the index to prewarm + """ + return LOOP.run(self._table.prewarm_index(name)) + + def prewarm_data(self, columns: Optional[List[str]] = None) -> None: + """Prewarm data for the table. + + This is a hint to the database that the given columns will be accessed + in the future and the database should prefetch the data if possible. + Currently only supported on remote tables. + + This call initiates prewarming and returns once the request is accepted. + It is idempotent and safe to call from multiple clients concurrently. + + This operation has a large upfront cost but can speed up future queries + that need to fetch the given columns. 
Large columns such as embeddings + or binary data may not be practical to prewarm. This feature is intended + for workloads that issue many queries against the same columns. + + Parameters + ---------- + columns: list of str, optional + The columns to prewarm. If None, all columns are prewarmed. + """ + return LOOP.run(self._table.prewarm_data(columns)) + def wait_for_index( self, index_names: Iterable[str], timeout: timedelta = timedelta(seconds=300) ): diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py index 4d3fd3a8a..acaf534f5 100644 --- a/python/python/lancedb/table.py +++ b/python/python/lancedb/table.py @@ -2219,12 +2219,18 @@ class LanceTable(Table): def prewarm_index(self, name: str) -> None: """ - Prewarms an index in the table + Prewarm an index in the table. - This loads the entire index into memory + This is a hint to the database that the index will be accessed in the + future and should be loaded into memory if possible. This can reduce + cold-start latency for subsequent queries. - If the index does not fit into the available cache this call - may be wasteful + This call initiates prewarming and returns once the request is accepted. + It is idempotent and safe to call from multiple clients concurrently. + + It is generally wasteful to call this if the index does not fit into the + available cache. Not all index types support prewarming; unsupported + indices will silently ignore the request. Parameters ---------- @@ -2233,6 +2239,29 @@ class LanceTable(Table): """ return LOOP.run(self._table.prewarm_index(name)) + def prewarm_data(self, columns: Optional[List[str]] = None) -> None: + """ + Prewarm data for the table. + + This is a hint to the database that the given columns will be accessed + in the future and the database should prefetch the data if possible. + Currently only supported on remote tables. + + This call initiates prewarming and returns once the request is accepted. 
+ It is idempotent and safe to call from multiple clients concurrently. + + This operation has a large upfront cost but can speed up future queries + that need to fetch the given columns. Large columns such as embeddings + or binary data may not be practical to prewarm. This feature is intended + for workloads that issue many queries against the same columns. + + Parameters + ---------- + columns: list of str, optional + The columns to prewarm. If None, all columns are prewarmed. + """ + return LOOP.run(self._table.prewarm_data(columns)) + def wait_for_index( self, index_names: Iterable[str], timeout: timedelta = timedelta(seconds=300) ) -> None: @@ -3634,19 +3663,47 @@ class AsyncTable: """ Prewarm an index in the table. + This is a hint to the database that the index will be accessed in the + future and should be loaded into memory if possible. This can reduce + cold-start latency for subsequent queries. + + This call initiates prewarming and returns once the request is accepted. + It is idempotent and safe to call from multiple clients concurrently. + + It is generally wasteful to call this if the index does not fit into the + available cache. Not all index types support prewarming; unsupported + indices will silently ignore the request. + Parameters ---------- name: str The name of the index to prewarm - - Notes - ----- - This will load the index into memory. This may reduce the cold-start time for - future queries. If the index does not fit in the cache then this call may be - wasteful. """ await self._inner.prewarm_index(name) + async def prewarm_data(self, columns: Optional[List[str]] = None) -> None: + """ + Prewarm data for the table. + + This is a hint to the database that the given columns will be accessed + in the future and the database should prefetch the data if possible. + Currently only supported on remote tables. + + This call initiates prewarming and returns once the request is accepted. 
+ It is idempotent and safe to call from multiple clients concurrently. + + This operation has a large upfront cost but can speed up future queries + that need to fetch the given columns. Large columns such as embeddings + or binary data may not be practical to prewarm. This feature is intended + for workloads that issue many queries against the same columns. + + Parameters + ---------- + columns: list of str, optional + The columns to prewarm. If None, all columns are prewarmed. + """ + await self._inner.prewarm_data(columns) + async def wait_for_index( self, index_names: Iterable[str], timeout: timedelta = timedelta(seconds=300) ) -> None: diff --git a/python/src/table.rs b/python/src/table.rs index e988cadb4..00015bba8 100644 --- a/python/src/table.rs +++ b/python/src/table.rs @@ -426,6 +426,17 @@ impl Table { }) } + pub fn prewarm_data( + self_: PyRef<'_, Self>, + columns: Option>, + ) -> PyResult> { + let inner = self_.inner_ref()?.clone(); + future_into_py(self_.py(), async move { + inner.prewarm_data(columns).await.infer_error()?; + Ok(()) + }) + } + pub fn list_indices(self_: PyRef<'_, Self>) -> PyResult> { let inner = self_.inner_ref()?.clone(); future_into_py(self_.py(), async move { diff --git a/rust/lancedb/src/remote/client.rs b/rust/lancedb/src/remote/client.rs index ac318c014..41a5949ca 100644 --- a/rust/lancedb/src/remote/client.rs +++ b/rust/lancedb/src/remote/client.rs @@ -426,14 +426,11 @@ impl RestfulLanceDbClient { })?, ); } - if db_prefix.is_some() { + if let Some(prefix) = db_prefix { headers.insert( HeaderName::from_static("x-lancedb-database-prefix"), - HeaderValue::from_str(db_prefix.unwrap()).map_err(|_| Error::InvalidInput { - message: format!( - "non-ascii database prefix '{}' provided", - db_prefix.unwrap() - ), + HeaderValue::from_str(prefix).map_err(|_| Error::InvalidInput { + message: format!("non-ascii database prefix '{}' provided", prefix), })?, ); } diff --git a/rust/lancedb/src/remote/table.rs b/rust/lancedb/src/remote/table.rs 
index a633e00ab..2532a9962 100644 --- a/rust/lancedb/src/remote/table.rs +++ b/rust/lancedb/src/remote/table.rs @@ -1645,10 +1645,33 @@ impl BaseTable for RemoteTable { Ok(()) } - async fn prewarm_index(&self, _index_name: &str) -> Result<()> { - Err(Error::NotSupported { - message: "prewarm_index is not yet supported on LanceDB cloud.".into(), - }) + async fn prewarm_index(&self, index_name: &str) -> Result<()> { + let request = self.client.post(&format!( + "/v1/table/{}/index/{}/prewarm/", + self.identifier, index_name + )); + let (request_id, response) = self.send(request, true).await?; + if response.status() == StatusCode::NOT_FOUND { + return Err(Error::IndexNotFound { + name: index_name.to_string(), + }); + } + self.check_table_response(&request_id, response).await?; + Ok(()) + } + + async fn prewarm_data(&self, columns: Option>) -> Result<()> { + let mut request = self.client.post(&format!( + "/v1/table/{}/page_cache/prewarm/", + self.identifier + )); + let body = serde_json::json!({ + "columns": columns.unwrap_or_default(), + }); + request = request.json(&body); + let (request_id, response) = self.send(request, true).await?; + self.check_table_response(&request_id, response).await?; + Ok(()) } async fn table_definition(&self) -> Result { @@ -3529,6 +3552,64 @@ mod tests { assert_eq!(result.version, if old_server { 0 } else { 43 }); } + #[tokio::test] + async fn test_prewarm_index() { + let table = Table::new_with_handler("my_table", |request| { + assert_eq!(request.method(), "POST"); + assert_eq!( + request.url().path(), + "/v1/table/my_table/index/my_index/prewarm/" + ); + http::Response::builder().status(200).body("{}").unwrap() + }); + table.prewarm_index("my_index").await.unwrap(); + } + + #[tokio::test] + async fn test_prewarm_index_not_found() { + let table = Table::new_with_handler("my_table", |request| { + assert_eq!( + request.url().path(), + "/v1/table/my_table/index/my_index/prewarm/" + ); + 
http::Response::builder().status(404).body("{}").unwrap() + }); + let e = table.prewarm_index("my_index").await.unwrap_err(); + assert!(matches!(e, Error::IndexNotFound { .. })); + } + + #[tokio::test] + async fn test_prewarm_data() { + let table = Table::new_with_handler("my_table", |request| { + assert_eq!(request.method(), "POST"); + assert_eq!( + request.url().path(), + "/v1/table/my_table/page_cache/prewarm/" + ); + http::Response::builder().status(200).body("{}").unwrap() + }); + table.prewarm_data(None).await.unwrap(); + } + + #[tokio::test] + async fn test_prewarm_data_with_columns() { + let table = Table::new_with_handler("my_table", |request| { + assert_eq!(request.method(), "POST"); + assert_eq!( + request.url().path(), + "/v1/table/my_table/page_cache/prewarm/" + ); + let body = request.body().unwrap().as_bytes().unwrap(); + let body: serde_json::Value = serde_json::from_slice(body).unwrap(); + assert_eq!(body["columns"], serde_json::json!(["col_a", "col_b"])); + http::Response::builder().status(200).body("{}").unwrap() + }); + table + .prewarm_data(Some(vec!["col_a".into(), "col_b".into()])) + .await + .unwrap(); + } + #[tokio::test] async fn test_drop_index() { let table = Table::new_with_handler("my_table", |request| { diff --git a/rust/lancedb/src/table.rs b/rust/lancedb/src/table.rs index d51a80369..3f88f8782 100644 --- a/rust/lancedb/src/table.rs +++ b/rust/lancedb/src/table.rs @@ -277,8 +277,13 @@ pub trait BaseTable: std::fmt::Display + std::fmt::Debug + Send + Sync { async fn list_indices(&self) -> Result>; /// Drop an index from the table. async fn drop_index(&self, name: &str) -> Result<()>; - /// Prewarm an index in the table + /// Prewarm an index in the table. async fn prewarm_index(&self, name: &str) -> Result<()>; + /// Prewarm data for the table. + /// + /// Currently only supported on remote tables. + /// If `columns` is `None`, all columns are prewarmed. 
+ async fn prewarm_data(&self, columns: Option<Vec<String>>) -> Result<()>; /// Get statistics about the index. async fn index_stats(&self, index_name: &str) -> Result>; /// Merge insert new records into the table. @@ -1123,22 +1128,45 @@ impl Table { self.inner.drop_index(name).await } - /// Prewarm an index in the table + /// Prewarm an index in the table. /// - /// This is a hint to fully load the index into memory. It can be used to - /// avoid cold starts + /// This is a hint to the database that the index will be accessed in the + /// future and should be loaded into memory if possible. This can reduce + /// cold-start latency for subsequent queries. + /// + /// This call initiates prewarming and returns once the request is accepted. + /// It is idempotent and safe to call from multiple clients concurrently. /// /// It is generally wasteful to call this if the index does not fit into the - /// available cache. - /// - /// Note: This function is not yet supported on all indices, in which case it - /// may do nothing. + /// available cache. Not all index types support prewarming; unsupported + /// indices will silently ignore the request. /// /// Use [`Self::list_indices()`] to find the names of the indices. pub async fn prewarm_index(&self, name: &str) -> Result<()> { self.inner.prewarm_index(name).await } + /// Prewarm data for the table. + /// + /// This is a hint to the database that the given columns will be accessed in + /// the future and the database should prefetch the data if possible. This + /// can reduce cold-start latency for subsequent queries. Currently only + /// supported on remote tables. + /// + /// This call initiates prewarming and returns once the request is accepted. + /// It is idempotent and safe to call from multiple clients concurrently — + /// calling it on already-prewarmed columns is a no-op on the server. + /// + /// This operation has a large upfront cost but can speed up future queries + /// that need to fetch the given columns.
Large columns such as embeddings + /// or binary data may not be practical to prewarm. This feature is intended + /// for workloads that issue many queries against the same columns. + /// + /// If `columns` is `None`, all columns are prewarmed. + pub async fn prewarm_data(&self, columns: Option<Vec<String>>) -> Result<()> { + self.inner.prewarm_data(columns).await + } + /// Poll until the columns are fully indexed. Will return Error::Timeout if the columns /// are not fully indexed within the timeout. pub async fn wait_for_index( @@ -2290,6 +2318,12 @@ impl BaseTable for NativeTable { Ok(dataset.prewarm_index(index_name).await?) } + async fn prewarm_data(&self, _columns: Option<Vec<String>>) -> Result<()> { + Err(Error::NotSupported { + message: "prewarm_data is currently only supported on remote tables.".into(), + }) + } + async fn update(&self, update: UpdateBuilder) -> Result { // Delegate to the submodule implementation update::execute_update(self, update).await