Compare commits

...

7 Commits

Author SHA1 Message Date
Lance Release
1090c311e8 [python] Bump version: 0.6.10 → 0.6.11 2024-04-27 03:54:58 +00:00
Weston Pace
e767cbb374 chore: update to Lance version 0.10.16 and Arrow version 51 (#1247) 2024-04-26 16:26:57 -07:00
Weston Pace
3d7c48feca feat: allow the index_cache_size to be configured when opening a table (#1245)
This was already configurable in the Rust API, but it wasn't actually
being passed down to the underlying dataset. I added this option to both
the async Python API and the new Node.js API.

I also added this option to the synchronous python API.

I did not add the option to vectordb.
2024-04-26 13:42:02 -07:00
Bert
08d62550bb fix: passing data to createTable as option (#1242)
Fixes issue where we would throw `Either data or schema needs to
defined` when passing `data` to `createTable` as a property of the first
argument (an object).

```ts
await db.createTable({
  name: 'table1',
  data,
  schema
})
```
2024-04-26 15:26:08 -04:00
Lei Xu
b272408b05 chore: fix main branch test failure (#1240) 2024-04-24 13:49:37 -07:00
Weston Pace
46ffa87cd4 chore: disable the remote feature by default (#1239)
The rust implementation of the remote client is not yet ready. This is
understandably confusing for users since it is enabled by default. This
PR disables it by default. We can re-enable it when we are ready (even
then it is not clear this is something that should be a default
feature).

---------

Co-authored-by: Will Jones <willjones127@gmail.com>
2024-04-24 09:28:24 -07:00
QianZhu
cd9fc37b95 add rename_table fn and more data for index_stats to return (#1234)
1. Added a rename_table fn so that the dashboard can rename a table.
2. Added index_type and distance_type (for vector indices) to index_stats
so that more detailed data can be shown on the table page.
2024-04-23 16:42:26 -07:00
19 changed files with 199 additions and 31 deletions

View File

@@ -14,19 +14,19 @@ keywords = ["lancedb", "lance", "database", "vector", "search"]
categories = ["database-implementations"]
[workspace.dependencies]
lance = { "version" = "=0.10.15", "features" = ["dynamodb"] }
lance-index = { "version" = "=0.10.15" }
lance-linalg = { "version" = "=0.10.15" }
lance-testing = { "version" = "=0.10.15" }
lance = { "version" = "=0.10.16", "features" = ["dynamodb"] }
lance-index = { "version" = "=0.10.16" }
lance-linalg = { "version" = "=0.10.16" }
lance-testing = { "version" = "=0.10.16" }
# Note that this one does not include pyarrow
arrow = { version = "50.0", optional = false }
arrow-array = "50.0"
arrow-data = "50.0"
arrow-ipc = "50.0"
arrow-ord = "50.0"
arrow-schema = "50.0"
arrow-arith = "50.0"
arrow-cast = "50.0"
arrow = { version = "51.0", optional = false }
arrow-array = "51.0"
arrow-data = "51.0"
arrow-ipc = "51.0"
arrow-ord = "51.0"
arrow-schema = "51.0"
arrow-arith = "51.0"
arrow-cast = "51.0"
async-trait = "0"
chrono = "0.4.35"
half = { "version" = "=2.3.1", default-features = false, features = [

View File

@@ -140,6 +140,9 @@ export class RemoteConnection implements Connection {
schema = nameOrOpts.schema
embeddings = nameOrOpts.embeddingFunction
tableName = nameOrOpts.name
if (data === undefined) {
data = nameOrOpts.data
}
}
let buffer: Buffer

View File

@@ -77,6 +77,18 @@ export interface OpenTableOptions {
* The available options are described at https://lancedb.github.io/lancedb/guides/storage/
*/
storageOptions?: Record<string, string>;
/**
* Set the size of the index cache, specified as a number of entries
*
* The exact meaning of an "entry" will depend on the type of index:
* - IVF: there is one entry for each IVF partition
* - BTREE: there is one entry for the entire index
*
* This cache applies to the entire opened table, across all indices.
* Setting this value higher will increase performance on larger datasets
* at the expense of more RAM
*/
indexCacheSize?: number;
}
export interface TableNamesOptions {
@@ -160,6 +172,7 @@ export class Connection {
const innerTable = await this.inner.openTable(
name,
cleanseStorageOptions(options?.storageOptions),
options?.indexCacheSize,
);
return new Table(innerTable);
}

View File

@@ -176,6 +176,7 @@ impl Connection {
&self,
name: String,
storage_options: Option<HashMap<String, String>>,
index_cache_size: Option<u32>,
) -> napi::Result<Table> {
let mut builder = self.get_inner()?.open_table(&name);
if let Some(storage_options) = storage_options {
@@ -183,6 +184,9 @@ impl Connection {
builder = builder.storage_option(key, value);
}
}
if let Some(index_cache_size) = index_cache_size {
builder = builder.index_cache_size(index_cache_size);
}
let tbl = builder
.execute()
.await

View File

@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.6.10
current_version = 0.6.11
commit = True
message = [python] Bump version: {current_version} → {new_version}
tag = True

View File

@@ -14,7 +14,7 @@ name = "_lancedb"
crate-type = ["cdylib"]
[dependencies]
arrow = { version = "50.0.0", features = ["pyarrow"] }
arrow = { version = "51.0.0", features = ["pyarrow"] }
lancedb = { path = "../rust/lancedb" }
env_logger = "0.10"
pyo3 = { version = "0.20", features = ["extension-module", "abi3-py38"] }

View File

@@ -1,6 +1,6 @@
[project]
name = "lancedb"
version = "0.6.10"
version = "0.6.11"
dependencies = [
"deprecation",
"pylance==0.10.12",

View File

@@ -224,13 +224,23 @@ class DBConnection(EnforceOverrides):
def __getitem__(self, name: str) -> LanceTable:
return self.open_table(name)
def open_table(self, name: str) -> Table:
def open_table(self, name: str, *, index_cache_size: Optional[int] = None) -> Table:
"""Open a Lance Table in the database.
Parameters
----------
name: str
The name of the table.
index_cache_size: int, default 256
Set the size of the index cache, specified as a number of entries
The exact meaning of an "entry" will depend on the type of index:
* IVF - there is one entry for each IVF partition
* BTREE - there is one entry for the entire index
This cache applies to the entire opened table, across all indices.
Setting this value higher will increase performance on larger datasets
at the expense of more RAM
Returns
-------
@@ -248,6 +258,18 @@ class DBConnection(EnforceOverrides):
"""
raise NotImplementedError
def rename_table(self, cur_name: str, new_name: str):
    """Rename a table in the database.

    This base implementation always raises; connection classes that
    support renaming override it.

    Parameters
    ----------
    cur_name: str
        The current name of the table.
    new_name: str
        The new name of the table.

    Raises
    ------
    NotImplementedError
        Always, in this base implementation.
    """
    raise NotImplementedError
def drop_database(self):
"""
Drop database
@@ -407,7 +429,9 @@ class LanceDBConnection(DBConnection):
return tbl
@override
def open_table(self, name: str) -> LanceTable:
def open_table(
self, name: str, *, index_cache_size: Optional[int] = None
) -> LanceTable:
"""Open a table in the database.
Parameters
@@ -419,7 +443,7 @@ class LanceDBConnection(DBConnection):
-------
A LanceTable object representing the table.
"""
return LanceTable.open(self, name)
return LanceTable.open(self, name, index_cache_size=index_cache_size)
@override
def drop_table(self, name: str, ignore_missing: bool = False):
@@ -751,7 +775,10 @@ class AsyncConnection(object):
return AsyncTable(new_table)
async def open_table(
self, name: str, storage_options: Optional[Dict[str, str]] = None
self,
name: str,
storage_options: Optional[Dict[str, str]] = None,
index_cache_size: Optional[int] = None,
) -> Table:
"""Open a Lance Table in the database.
@@ -764,12 +791,22 @@ class AsyncConnection(object):
connection will be inherited by the table, but can be overridden here.
See available options at
https://lancedb.github.io/lancedb/guides/storage/
index_cache_size: int, default 256
Set the size of the index cache, specified as a number of entries
The exact meaning of an "entry" will depend on the type of index:
* IVF - there is one entry for each IVF partition
* BTREE - there is one entry for the entire index
This cache applies to the entire opened table, across all indices.
Setting this value higher will increase performance on larger datasets
at the expense of more RAM
Returns
-------
A LanceTable object representing the table.
"""
table = await self._inner.open_table(name, storage_options)
table = await self._inner.open_table(name, storage_options, index_cache_size)
return AsyncTable(table)
async def drop_table(self, name: str):

View File

@@ -94,7 +94,7 @@ class RemoteDBConnection(DBConnection):
yield item
@override
def open_table(self, name: str) -> Table:
def open_table(self, name: str, *, index_cache_size: Optional[int] = None) -> Table:
"""Open a Lance Table in the database.
Parameters
@@ -110,6 +110,12 @@ class RemoteDBConnection(DBConnection):
self._client.mount_retry_adapter_for_table(name)
if index_cache_size is not None:
logging.info(
"index_cache_size is ignored in LanceDb Cloud"
" (there is no local cache to configure)"
)
# check if table exists
if self._table_cache.get(name) is None:
self._client.post(f"/v1/table/{name}/describe/")
@@ -281,6 +287,24 @@ class RemoteDBConnection(DBConnection):
)
self._table_cache.pop(name)
@override
def rename_table(self, cur_name: str, new_name: str):
    """Rename a table in the database.

    Posts the rename request to the server, then moves the local
    table-cache entry from the old name to the new one.

    Parameters
    ----------
    cur_name: str
        The current name of the table.
    new_name: str
        The new name of the table.
    """
    self._client.post(
        f"/v1/table/{cur_name}/rename/",
        json={"new_table_name": new_name},
    )
    # The table may never have been cached by this connection. A bare
    # pop() would then raise KeyError even though the server-side rename
    # has already succeeded, so tolerate a missing entry.
    self._table_cache.pop(cur_name, None)
    self._table_cache[new_name] = True
async def close(self):
"""Close the connection to the database."""
self._client.close()

View File

@@ -72,7 +72,7 @@ class RemoteTable(Table):
return resp
def index_stats(self, index_uuid: str):
"""List all the indices on the table"""
"""List all the stats of a specified index"""
resp = self._conn._client.post(
f"/v1/table/{self._name}/index/{index_uuid}/stats/"
)

View File

@@ -806,6 +806,7 @@ class _LanceLatestDatasetRef(_LanceDatasetRef):
"""Reference to the latest version of a LanceDataset."""
uri: str
index_cache_size: Optional[int] = None
read_consistency_interval: Optional[timedelta] = None
last_consistency_check: Optional[float] = None
_dataset: Optional[LanceDataset] = None
@@ -813,7 +814,9 @@ class _LanceLatestDatasetRef(_LanceDatasetRef):
@property
def dataset(self) -> LanceDataset:
if not self._dataset:
self._dataset = lance.dataset(self.uri)
self._dataset = lance.dataset(
self.uri, index_cache_size=self.index_cache_size
)
self.last_consistency_check = time.monotonic()
elif self.read_consistency_interval is not None:
now = time.monotonic()
@@ -842,12 +845,15 @@ class _LanceLatestDatasetRef(_LanceDatasetRef):
class _LanceTimeTravelRef(_LanceDatasetRef):
uri: str
version: int
index_cache_size: Optional[int] = None
_dataset: Optional[LanceDataset] = None
@property
def dataset(self) -> LanceDataset:
if not self._dataset:
self._dataset = lance.dataset(self.uri, version=self.version)
self._dataset = lance.dataset(
self.uri, version=self.version, index_cache_size=self.index_cache_size
)
return self._dataset
@dataset.setter
@@ -884,6 +890,8 @@ class LanceTable(Table):
connection: "LanceDBConnection",
name: str,
version: Optional[int] = None,
*,
index_cache_size: Optional[int] = None,
):
self._conn = connection
self.name = name
@@ -892,11 +900,13 @@ class LanceTable(Table):
self._ref = _LanceTimeTravelRef(
uri=self._dataset_uri,
version=version,
index_cache_size=index_cache_size,
)
else:
self._ref = _LanceLatestDatasetRef(
uri=self._dataset_uri,
read_consistency_interval=connection.read_consistency_interval,
index_cache_size=index_cache_size,
)
@classmethod

View File

@@ -368,6 +368,15 @@ async def test_create_exist_ok_async(tmp_path):
# await db.create_table("test", schema=bad_schema, exist_ok=True)
def test_open_table_sync(tmp_path):
    """Sync API: open a table with and without ``index_cache_size``; a missing table raises."""
    conn = lancedb.connect(tmp_path)
    conn.create_table("test", data=[{"id": 0}])

    # Open with default options.
    default_tbl = conn.open_table("test")
    assert default_tbl.count_rows() == 1

    # Open with an explicit index cache size.
    sized_tbl = conn.open_table("test", index_cache_size=0)
    assert sized_tbl.count_rows() == 1

    # Opening an unknown table must fail.
    with pytest.raises(FileNotFoundError, match="does not exist"):
        conn.open_table("does_not_exist")
@pytest.mark.asyncio
async def test_open_table(tmp_path):
db = await lancedb.connect_async(tmp_path)
@@ -397,6 +406,10 @@ async def test_open_table(tmp_path):
}
)
# No way to verify this yet, but at least make sure we
# can pass the parameter
await db.open_table("test", index_cache_size=0)
with pytest.raises(ValueError, match="was not found"):
await db.open_table("does_not_exist")

View File

@@ -134,17 +134,21 @@ impl Connection {
})
}
#[pyo3(signature = (name, storage_options = None))]
#[pyo3(signature = (name, storage_options = None, index_cache_size = None))]
pub fn open_table(
self_: PyRef<'_, Self>,
name: String,
storage_options: Option<HashMap<String, String>>,
index_cache_size: Option<u32>,
) -> PyResult<&PyAny> {
let inner = self_.get_inner()?.clone();
let mut builder = inner.open_table(name);
if let Some(storage_options) = storage_options {
builder = builder.storage_options(storage_options);
}
if let Some(index_cache_size) = index_cache_size {
builder = builder.index_cache_size(index_cache_size);
}
future_into_py(self_.py(), async move {
let table = builder.execute().await.infer_error()?;
Ok(Table::new(table))

View File

@@ -52,7 +52,7 @@ aws-sdk-kms = { version = "1.0" }
aws-config = { version = "1.0" }
[features]
default = ["remote"]
default = []
remote = ["dep:reqwest"]
fp16kernels = ["lance-linalg/fp16kernels"]
s3-test = []
s3-test = []

View File

@@ -33,6 +33,9 @@ use crate::table::{NativeTable, WriteOptions};
use crate::utils::validate_table_name;
use crate::Table;
#[cfg(feature = "remote")]
use log::warn;
pub const LANCE_FILE_EXTENSION: &str = "lance";
pub type TableBuilderCallback = Box<dyn FnOnce(OpenTableBuilder) -> OpenTableBuilder + Send>;
@@ -579,6 +582,7 @@ impl ConnectBuilder {
let api_key = self.api_key.ok_or_else(|| Error::InvalidInput {
message: "An api_key is required when connecting to LanceDb Cloud".to_string(),
})?;
warn!("The rust implementation of the remote client is not yet ready for use.");
let internal = Arc::new(crate::remote::db::RemoteDatabase::try_new(
&self.uri,
&api_key,
@@ -909,12 +913,23 @@ impl ConnectionInternal for Database {
}
}
// Some ReadParams are exposed in the OpenTableBuilder, but we also
// let the user provide their own ReadParams.
//
// If we have a user provided ReadParams use that
// If we don't then start with the default ReadParams and customize it with
// the options from the OpenTableBuilder
let read_params = options.lance_read_params.unwrap_or_else(|| ReadParams {
index_cache_size: options.index_cache_size as usize,
..Default::default()
});
let native_table = Arc::new(
NativeTable::open_with_params(
&table_uri,
&options.name,
self.store_wrapper.clone(),
options.lance_read_params,
Some(read_params),
self.read_consistency_interval,
)
.await?,
@@ -1032,7 +1047,6 @@ mod tests {
}
#[tokio::test]
#[ignore = "this can't pass due to https://github.com/lancedb/lancedb/issues/1019, enable it after the bug fixed"]
async fn test_open_table() {
let tmp_dir = tempdir().unwrap();
let uri = tmp_dir.path().to_str().unwrap();

View File

@@ -46,10 +46,18 @@ impl VectorIndex {
}
}
/// Metadata for a single entry of [`VectorIndexStatistics::indices`].
///
/// Populated by deserialization; both fields are kept as plain strings.
#[derive(Debug, Deserialize)]
pub struct VectorIndexMetadata {
    /// Metric (distance) type of this entry, as a string.
    pub metric_type: String,
    /// Index type of this entry, as a string.
    pub index_type: String,
}
/// Statistics reported for a vector index, populated by deserialization.
#[derive(Debug, Deserialize)]
pub struct VectorIndexStatistics {
    /// Number of rows covered by the index.
    pub num_indexed_rows: usize,
    /// Number of rows not covered by the index.
    pub num_unindexed_rows: usize,
    /// Index type, as a string.
    pub index_type: String,
    /// Per-entry metadata; each entry carries its own metric type and index type.
    pub indices: Vec<VectorIndexMetadata>,
}
/// Builder for an IVF PQ index.

View File

@@ -350,8 +350,16 @@ mod test {
#[tokio::test]
async fn test_e2e() {
let dir1 = tempfile::tempdir().unwrap().into_path();
let dir2 = tempfile::tempdir().unwrap().into_path();
let dir1 = tempfile::tempdir()
.unwrap()
.into_path()
.canonicalize()
.unwrap();
let dir2 = tempfile::tempdir()
.unwrap()
.into_path()
.canonicalize()
.unwrap();
let secondary_store = LocalFileSystem::new_with_prefix(dir2.to_str().unwrap()).unwrap();
let object_store_wrapper = Arc::new(MirroringObjectStoreWrapper {

View File

@@ -34,6 +34,16 @@
//! cargo install lancedb
//! ```
//!
//! ## Crate Features
//!
//! ### Experimental Features
//!
//! These features are not enabled by default. They are experimental or in-development features that
//! are not yet ready to be released.
//!
//! - `remote` - Enables the remote client for connecting to LanceDB Cloud. The client is not yet
//! fully implemented, so this feature should not be enabled.
//!
//! ### Quick Start
//!
//! #### Connect to a database.

View File

@@ -1061,6 +1061,26 @@ impl NativeTable {
}
}
/// Look up the `index_type` recorded in the statistics of the index `index_uuid`.
///
/// Returns `Ok(None)` when `load_index_stats` yields no statistics for that
/// index. Errors from `load_index_stats` are propagated unchanged.
pub async fn get_index_type(&self, index_uuid: &str) -> Result<Option<String>> {
    let maybe_stats = self.load_index_stats(index_uuid).await?;
    Ok(maybe_stats.map(|s| s.index_type))
}
/// Look up the metric (distance) type recorded for the index `index_uuid`.
///
/// The `metric_type` strings of every entry in the statistics' `indices` list
/// are concatenated, in order and without a separator, into a single `String`.
/// Returns `Ok(None)` when `load_index_stats` yields no statistics for that
/// index. Errors from `load_index_stats` are propagated unchanged.
pub async fn get_distance_type(&self, index_uuid: &str) -> Result<Option<String>> {
    let maybe_stats = self.load_index_stats(index_uuid).await?;
    // NOTE(review): with several entries this joins their metric types back to
    // back (two entries with metric `m` give `mm`), and an empty list gives an
    // empty string. Confirm whether callers expect a single metric name.
    Ok(maybe_stats.map(|s| {
        s.indices
            .iter()
            .map(|entry| entry.metric_type.as_str())
            .collect::<String>()
    }))
}
pub async fn load_indices(&self) -> Result<Vec<VectorIndex>> {
let dataset = self.dataset.get().await?;
let (indices, mf) = futures::try_join!(dataset.load_indices(), dataset.latest_manifest())?;