From 3d7c48fecad4872d3d6ceef4188560b5bb45796a Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Fri, 26 Apr 2024 13:42:02 -0700 Subject: [PATCH] feat: allow the index_cache_size to be configured when opening a table (#1245) This was already configurable in the rust API but it wasn't actually being passed down to the underlying dataset. I added this option to both the async python API and the new nodejs API. I also added this option to the synchronous python API. I did not add the option to vectordb. --- nodejs/lancedb/connection.ts | 13 +++++++++++ nodejs/src/connection.rs | 4 ++++ python/python/lancedb/db.py | 35 +++++++++++++++++++++++++----- python/python/lancedb/remote/db.py | 8 ++++++- python/python/lancedb/table.py | 14 ++++++++++-- python/python/tests/test_db.py | 13 +++++++++++ python/src/connection.rs | 6 ++++- rust/lancedb/src/connection.rs | 13 ++++++++++- 8 files changed, 96 insertions(+), 10 deletions(-) diff --git a/nodejs/lancedb/connection.ts b/nodejs/lancedb/connection.ts index 8bc8b7f5..41446be2 100644 --- a/nodejs/lancedb/connection.ts +++ b/nodejs/lancedb/connection.ts @@ -77,6 +77,18 @@ export interface OpenTableOptions { * The available options are described at https://lancedb.github.io/lancedb/guides/storage/ */ storageOptions?: Record; + /** + * Set the size of the index cache, specified as a number of entries + * + * The exact meaning of an "entry" will depend on the type of index: + * - IVF: there is one entry for each IVF partition + * - BTREE: there is one entry for the entire index + * + * This cache applies to the entire opened table, across all indices. + * Setting this value higher will increase performance on larger datasets + * at the expense of more RAM + */ + indexCacheSize?: number; } export interface TableNamesOptions { @@ -160,6 +172,7 @@ export class Connection { const innerTable = await this.inner.openTable( name, cleanseStorageOptions(options?.storageOptions), + options?.indexCacheSize, ); return new Table(innerTable); } diff --git a/nodejs/src/connection.rs b/nodejs/src/connection.rs index 6ce4dfe2..5b45a580 100644 --- a/nodejs/src/connection.rs +++ b/nodejs/src/connection.rs @@ -176,6 +176,7 @@ impl Connection { &self, name: String, storage_options: Option>, + index_cache_size: Option, ) -> napi::Result { let mut builder = self.get_inner()?.open_table(&name); if let Some(storage_options) = storage_options { @@ -183,6 +184,9 @@ impl Connection { builder = builder.storage_option(key, value); } } + if let Some(index_cache_size) = index_cache_size { + builder = builder.index_cache_size(index_cache_size); + } let tbl = builder .execute() .await diff --git a/python/python/lancedb/db.py b/python/python/lancedb/db.py index 204edc87..d5d294ac 100644 --- a/python/python/lancedb/db.py +++ b/python/python/lancedb/db.py @@ -224,13 +224,23 @@ class DBConnection(EnforceOverrides): def __getitem__(self, name: str) -> LanceTable: return self.open_table(name) - def open_table(self, name: str) -> Table: + def open_table(self, name: str, *, index_cache_size: Optional[int] = None) -> Table: """Open a Lance Table in the database. Parameters ---------- name: str The name of the table. + index_cache_size: int, default 256 + Set the size of the index cache, specified as a number of entries + + The exact meaning of an "entry" will depend on the type of index: + * IVF - there is one entry for each IVF partition + * BTREE - there is one entry for the entire index + + This cache applies to the entire opened table, across all indices. + Setting this value higher will increase performance on larger datasets + at the expense of more RAM Returns ------- @@ -419,7 +429,9 @@ class LanceDBConnection(DBConnection): return tbl @override - def open_table(self, name: str) -> LanceTable: + def open_table( + self, name: str, *, index_cache_size: Optional[int] = None + ) -> LanceTable: """Open a table in the database. Parameters @@ -431,7 +443,7 @@ class LanceDBConnection(DBConnection): ------- A LanceTable object representing the table. """ - return LanceTable.open(self, name) + return LanceTable.open(self, name, index_cache_size=index_cache_size) @override def drop_table(self, name: str, ignore_missing: bool = False): @@ -763,7 +775,10 @@ class AsyncConnection(object): return AsyncTable(new_table) async def open_table( - self, name: str, storage_options: Optional[Dict[str, str]] = None + self, + name: str, + storage_options: Optional[Dict[str, str]] = None, + index_cache_size: Optional[int] = None, ) -> Table: """Open a Lance Table in the database. @@ -776,12 +791,22 @@ class AsyncConnection(object): connection will be inherited by the table, but can be overridden here. See available options at https://lancedb.github.io/lancedb/guides/storage/ + index_cache_size: int, default 256 + Set the size of the index cache, specified as a number of entries + + The exact meaning of an "entry" will depend on the type of index: + * IVF - there is one entry for each IVF partition + * BTREE - there is one entry for the entire index + + This cache applies to the entire opened table, across all indices. + Setting this value higher will increase performance on larger datasets + at the expense of more RAM Returns ------- A LanceTable object representing the table. """ - table = await self._inner.open_table(name, storage_options) + table = await self._inner.open_table(name, storage_options, index_cache_size) return AsyncTable(table) async def drop_table(self, name: str): diff --git a/python/python/lancedb/remote/db.py b/python/python/lancedb/remote/db.py index 3809c511..3b0fc3bd 100644 --- a/python/python/lancedb/remote/db.py +++ b/python/python/lancedb/remote/db.py @@ -94,7 +94,7 @@ class RemoteDBConnection(DBConnection): yield item @override - def open_table(self, name: str) -> Table: + def open_table(self, name: str, *, index_cache_size: Optional[int] = None) -> Table: """Open a Lance Table in the database. Parameters @@ -110,6 +110,12 @@ class RemoteDBConnection(DBConnection): self._client.mount_retry_adapter_for_table(name) + if index_cache_size is not None: + logging.info( + "index_cache_size is ignored in LanceDb Cloud" + " (there is no local cache to configure)" + ) + # check if table exists if self._table_cache.get(name) is None: self._client.post(f"/v1/table/{name}/describe/") diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py index 58d0d0bf..8c630601 100644 --- a/python/python/lancedb/table.py +++ b/python/python/lancedb/table.py @@ -806,6 +806,7 @@ class _LanceLatestDatasetRef(_LanceDatasetRef): """Reference to the latest version of a LanceDataset.""" uri: str + index_cache_size: Optional[int] = None read_consistency_interval: Optional[timedelta] = None last_consistency_check: Optional[float] = None _dataset: Optional[LanceDataset] = None @@ -813,7 +814,9 @@ class _LanceLatestDatasetRef(_LanceDatasetRef): @property def dataset(self) -> LanceDataset: if not self._dataset: - self._dataset = lance.dataset(self.uri) + self._dataset = lance.dataset( + self.uri, index_cache_size=self.index_cache_size + ) self.last_consistency_check = time.monotonic() elif self.read_consistency_interval is not None: now = time.monotonic() @@ -842,12 +845,15 @@ class _LanceLatestDatasetRef(_LanceDatasetRef): class _LanceTimeTravelRef(_LanceDatasetRef): uri: str version: int + index_cache_size: Optional[int] = None _dataset: Optional[LanceDataset] = None @property def dataset(self) -> LanceDataset: if not self._dataset: - self._dataset = lance.dataset(self.uri, version=self.version) + self._dataset = lance.dataset( + self.uri, version=self.version, index_cache_size=self.index_cache_size + ) return self._dataset @dataset.setter @@ -884,6 +890,8 @@ class LanceTable(Table): connection: "LanceDBConnection", name: str, version: Optional[int] = None, + *, + index_cache_size: Optional[int] = None, ): self._conn = connection self.name = name @@ -892,11 +900,13 @@ class LanceTable(Table): self._ref = _LanceTimeTravelRef( uri=self._dataset_uri, version=version, + index_cache_size=index_cache_size, ) else: self._ref = _LanceLatestDatasetRef( uri=self._dataset_uri, read_consistency_interval=connection.read_consistency_interval, + index_cache_size=index_cache_size, ) @classmethod diff --git a/python/python/tests/test_db.py b/python/python/tests/test_db.py index 82b90c0a..40fc4998 100644 --- a/python/python/tests/test_db.py +++ b/python/python/tests/test_db.py @@ -368,6 +368,15 @@ async def test_create_exist_ok_async(tmp_path): # await db.create_table("test", schema=bad_schema, exist_ok=True) +def test_open_table_sync(tmp_path): + db = lancedb.connect(tmp_path) + db.create_table("test", data=[{"id": 0}]) + assert db.open_table("test").count_rows() == 1 + assert db.open_table("test", index_cache_size=0).count_rows() == 1 + with pytest.raises(FileNotFoundError, match="does not exist"): + db.open_table("does_not_exist") + + @pytest.mark.asyncio async def test_open_table(tmp_path): db = await lancedb.connect_async(tmp_path) @@ -397,6 +406,10 @@ async def test_open_table(tmp_path): } ) + # No way to verify this yet, but at least make sure we + # can pass the parameter + await db.open_table("test", index_cache_size=0) + with pytest.raises(ValueError, match="was not found"): await db.open_table("does_not_exist") diff --git a/python/src/connection.rs b/python/src/connection.rs index 0ffe9eae..bf55facd 100644 --- a/python/src/connection.rs +++ b/python/src/connection.rs @@ -134,17 +134,21 @@ impl Connection { }) } - #[pyo3(signature = (name, storage_options = None))] + #[pyo3(signature = (name, storage_options = None, index_cache_size = None))] pub fn open_table( self_: PyRef<'_, Self>, name: String, storage_options: Option>, + index_cache_size: Option, ) -> PyResult<&PyAny> { let inner = self_.get_inner()?.clone(); let mut builder = inner.open_table(name); if let Some(storage_options) = storage_options { builder = builder.storage_options(storage_options); } + if let Some(index_cache_size) = index_cache_size { + builder = builder.index_cache_size(index_cache_size); + } future_into_py(self_.py(), async move { let table = builder.execute().await.infer_error()?; Ok(Table::new(table)) diff --git a/rust/lancedb/src/connection.rs b/rust/lancedb/src/connection.rs index 0dab78c9..26e345ce 100644 --- a/rust/lancedb/src/connection.rs +++ b/rust/lancedb/src/connection.rs @@ -913,12 +913,23 @@ impl ConnectionInternal for Database { } } + // Some ReadParams are exposed in the OpenTableBuilder, but we also + // let the user provide their own ReadParams. + // + // If we have a user provided ReadParams use that + // If we don't then start with the default ReadParams and customize it with + // the options from the OpenTableBuilder + let read_params = options.lance_read_params.unwrap_or_else(|| ReadParams { + index_cache_size: options.index_cache_size as usize, + ..Default::default() + }); + let native_table = Arc::new( NativeTable::open_with_params( &table_uri, &options.name, self.store_wrapper.clone(), - options.lance_read_params, + Some(read_params), self.read_consistency_interval, ) .await?,