Compare commits

...

7 Commits

Author SHA1 Message Date
Lance Release
247fb58400 Bump version: 0.25.1 → 0.25.2-beta.0 2025-09-24 22:54:09 +00:00
Jack Ye
504bdc471c feat(rust): support namespace backed database (#2664)
This PR adds support for namespace-backed databases through
lance-namespace integration, enabling centralized table management
through namespace APIs.

---------

Co-authored-by: Claude <noreply@anthropic.com>
2025-09-24 15:33:31 -07:00
Will Jones
d617cdef4a feat: add use_index parameter to merge insert operations (#2674)
## Summary

Exposes `use_index` Merge Insert parameter, which was created upstream
in https://github.com/lancedb/lance/pull/4688.

## API Examples

### Python
```python
# Force table scan
table.merge_insert(["id"]) \
    .when_not_matched_insert_all() \
    .use_index(False) \
    .execute(data)
```

### Node.js/TypeScript
```typescript
// Force table scan  
await table.mergeInsert("id")
    .whenNotMatchedInsertAll()
    .useIndex(false)
    .execute(data);
```

### Rust
```rust
// Force table scan
let mut builder = table.merge_insert(&["id"]);
builder.when_not_matched_insert_all()
       .use_index(false);
builder.execute(data).await?;
```

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-authored-by: Claude <noreply@anthropic.com>
2025-09-24 12:50:21 -07:00
Will Jones
356d7046fd ci: fix test failure on main (#2677)
Test was in wrong position.
2025-09-24 09:46:04 -07:00
Will Jones
48e5caabda ci(nodejs): lint for unused imports (#2673) 2025-09-23 18:49:42 -07:00
Lance Release
d6cc68f671 Bump version: 0.22.1-beta.4 → 0.22.1 2025-09-23 22:07:31 +00:00
Lance Release
55eacfa685 Bump version: 0.22.1-beta.3 → 0.22.1-beta.4 2025-09-23 22:06:45 +00:00
44 changed files with 2487 additions and 301 deletions

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.22.1-beta.3"
current_version = "0.22.1"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -116,7 +116,7 @@ jobs:
set -e
npm ci
npm run docs
if ! git diff --exit-code -- . ':(exclude)Cargo.lock'; then
if ! git diff --exit-code -- ../ ':(exclude)Cargo.lock'; then
echo "Docs need to be updated"
echo "Run 'npm run docs', fix any warnings, and commit the changes."
exit 1

1472
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -23,6 +23,7 @@ lance-table = "=0.37.0"
lance-testing = "=0.37.0"
lance-datafusion = "=0.37.0"
lance-encoding = "=0.37.0"
lance-namespace = "0.0.15"
# Note that this one does not include pyarrow
arrow = { version = "55.1", optional = false }
arrow-array = "55.1"

View File

@@ -25,6 +25,51 @@ the underlying connection has been closed.
## Methods
### cloneTable()
```ts
abstract cloneTable(
targetTableName,
sourceUri,
options?): Promise<Table>
```
Clone a table from a source table.
A shallow clone creates a new table that shares the underlying data files
with the source table but has its own independent manifest. This allows
both the source and cloned tables to evolve independently while initially
sharing the same data, deletion, and index files.
#### Parameters
* **targetTableName**: `string`
The name of the target table to create.
* **sourceUri**: `string`
The URI of the source table to clone from.
* **options?**
Clone options.
* **options.isShallow?**: `boolean`
Whether to perform a shallow clone (defaults to true).
* **options.sourceTag?**: `string`
The tag of the source table to clone.
* **options.sourceVersion?**: `number`
The version of the source table to clone.
* **options.targetNamespace?**: `string`[]
The namespace for the target table (defaults to root namespace).
#### Returns
`Promise`&lt;[`Table`](Table.md)&gt;
***
### close()
```ts

View File

@@ -13,7 +13,7 @@ function makeArrowTable(
metadata?): ArrowTable
```
An enhanced version of the makeTable function from Apache Arrow
An enhanced version of the apache-arrow makeTable function from Apache Arrow
that supports nested fields and embeddings columns.
(typically you do not need to call this function. It will be called automatically

View File

@@ -78,6 +78,7 @@
- [TableNamesOptions](interfaces/TableNamesOptions.md)
- [TableStatistics](interfaces/TableStatistics.md)
- [TimeoutConfig](interfaces/TimeoutConfig.md)
- [TlsConfig](interfaces/TlsConfig.md)
- [TokenResponse](interfaces/TokenResponse.md)
- [UpdateOptions](interfaces/UpdateOptions.md)
- [UpdateResult](interfaces/UpdateResult.md)

View File

@@ -40,6 +40,14 @@ optional timeoutConfig: TimeoutConfig;
***
### tlsConfig?
```ts
optional tlsConfig: TlsConfig;
```
***
### userAgent?
```ts

View File

@@ -0,0 +1,49 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / TlsConfig
# Interface: TlsConfig
TLS/mTLS configuration for the remote HTTP client.
## Properties
### assertHostname?
```ts
optional assertHostname: boolean;
```
Whether to verify the hostname in the server's certificate.
***
### certFile?
```ts
optional certFile: string;
```
Path to the client certificate file (PEM format) for mTLS authentication.
***
### keyFile?
```ts
optional keyFile: string;
```
Path to the client private key file (PEM format) for mTLS authentication.
***
### sslCaCert?
```ts
optional sslCaCert: string;
```
Path to the CA certificate file (PEM format) for server verification.

View File

@@ -8,7 +8,7 @@
<parent>
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.22.1-beta.3</version>
<version>0.22.1-final.0</version>
<relativePath>../pom.xml</relativePath>
</parent>

View File

@@ -8,7 +8,7 @@
<parent>
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.22.1-beta.3</version>
<version>0.22.1-final.0</version>
<relativePath>../pom.xml</relativePath>
</parent>

View File

@@ -6,7 +6,7 @@
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.22.1-beta.3</version>
<version>0.22.1-final.0</version>
<packaging>pom</packaging>
<name>${project.artifactId}</name>
<description>LanceDB Java SDK Parent POM</description>

View File

@@ -1,7 +1,7 @@
[package]
name = "lancedb-nodejs"
edition.workspace = true
version = "0.22.1-beta.3"
version = "0.22.1"
license.workspace = true
description.workspace = true
repository.workspace = true

View File

@@ -1,17 +1,5 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
import {
Bool,
Field,
Int32,
List,
Schema,
Struct,
Uint8,
Utf8,
} from "apache-arrow";
import * as arrow15 from "apache-arrow-15";
import * as arrow16 from "apache-arrow-16";
import * as arrow17 from "apache-arrow-17";
@@ -25,11 +13,9 @@ import {
fromTableToBuffer,
makeArrowTable,
makeEmptyTable,
tableFromIPC,
} from "../lancedb/arrow";
import {
EmbeddingFunction,
FieldOptions,
FunctionOptions,
} from "../lancedb/embedding/embedding_function";
import { EmbeddingFunctionConfig } from "../lancedb/embedding/registry";
@@ -1037,35 +1023,35 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
expect(table.getChild("test")!.get(2)).toBe(false);
});
});
// Test for the undefined values bug fix
describe("undefined values handling", () => {
it("should handle mixed undefined and actual values", () => {
const schema = new Schema([
new Field("text", new Utf8(), true), // nullable
new Field("number", new Int32(), true), // nullable
new Field("bool", new Bool(), true), // nullable
]);
const data = [
{ text: undefined, number: 42, bool: true },
{ text: "hello", number: undefined, bool: false },
{ text: "world", number: 123, bool: undefined },
];
const table = makeArrowTable(data, { schema });
const result = table.toArray();
expect(result).toHaveLength(3);
expect(result[0].text).toBe(null);
expect(result[0].number).toBe(42);
expect(result[0].bool).toBe(true);
expect(result[1].text).toBe("hello");
expect(result[1].number).toBe(null);
expect(result[1].bool).toBe(false);
expect(result[2].text).toBe("world");
expect(result[2].number).toBe(123);
expect(result[2].bool).toBe(null);
});
});
},
);
// Test for the undefined values bug fix
describe("undefined values handling", () => {
it("should handle mixed undefined and actual values", () => {
const schema = new Schema([
new Field("text", new Utf8(), true), // nullable
new Field("number", new Int32(), true), // nullable
new Field("bool", new Bool(), true), // nullable
]);
const data = [
{ text: undefined, number: 42, bool: true },
{ text: "hello", number: undefined, bool: false },
{ text: "world", number: 123, bool: undefined },
];
const table = makeArrowTable(data, { schema });
const result = table.toArray();
expect(result).toHaveLength(3);
expect(result[0].text).toBe(null);
expect(result[0].number).toBe(42);
expect(result[0].bool).toBe(true);
expect(result[1].text).toBe("hello");
expect(result[1].number).toBe(null);
expect(result[1].bool).toBe(false);
expect(result[2].text).toBe("world");
expect(result[2].number).toBe(123);
expect(result[2].bool).toBe(null);
});
});

View File

@@ -7,7 +7,6 @@ import {
ClientConfig,
Connection,
ConnectionOptions,
NativeJsHeaderProvider,
TlsConfig,
connect,
} from "../lancedb";

View File

@@ -39,7 +39,6 @@ import {
Operator,
instanceOfFullTextQuery,
} from "../lancedb/query";
import exp = require("constants");
describe.each([arrow15, arrow16, arrow17, arrow18])(
"Given a table",
@@ -488,6 +487,32 @@ describe("merge insert", () => {
.execute(newData, { timeoutMs: 0 }),
).rejects.toThrow("merge insert timed out");
});
test("useIndex", async () => {
const newData = [
{ a: 2, b: "x" },
{ a: 4, b: "z" },
];
// Test with useIndex(true) - should work fine
const result1 = await table
.mergeInsert("a")
.whenNotMatchedInsertAll()
.useIndex(true)
.execute(newData);
expect(result1.numInsertedRows).toBe(1); // Only a=4 should be inserted
// Test with useIndex(false) - should also work fine
const newData2 = [{ a: 5, b: "w" }];
const result2 = await table
.mergeInsert("a")
.whenNotMatchedInsertAll()
.useIndex(false)
.execute(newData2);
expect(result2.numInsertedRows).toBe(1); // a=5 should be inserted
});
});
describe("When creating an index", () => {

View File

@@ -48,6 +48,7 @@
"noUnreachableSuper": "error",
"noUnsafeFinally": "error",
"noUnsafeOptionalChaining": "error",
"noUnusedImports": "error",
"noUnusedLabels": "error",
"noUnusedVariables": "warn",
"useIsNan": "error",

View File

@@ -41,7 +41,6 @@ import {
vectorFromArray as badVectorFromArray,
makeBuilder,
makeData,
makeTable,
} from "apache-arrow";
import { Buffers } from "apache-arrow/data";
import { type EmbeddingFunction } from "./embedding/embedding_function";
@@ -279,7 +278,7 @@ export class MakeArrowTableOptions {
}
/**
* An enhanced version of the {@link makeTable} function from Apache Arrow
* An enhanced version of the apache-arrow makeTable function from Apache Arrow
* that supports nested fields and embeddings columns.
*
* (typically you do not need to call this function. It will be called automatically

View File

@@ -3,7 +3,6 @@
import {
Data,
Schema,
SchemaLike,
TableLike,
fromTableToStreamBuffer,

View File

@@ -70,6 +70,23 @@ export class MergeInsertBuilder {
this.#schema,
);
}
/**
* Controls whether to use indexes for the merge operation.
*
* When set to `true` (the default), the operation will use an index if available
* on the join key for improved performance. When set to `false`, it forces a full
* table scan even if an index exists. This can be useful for benchmarking or when
* the query optimizer chooses a suboptimal path.
*
* @param useIndex - Whether to use indices for the merge operation. Defaults to `true`.
*/
useIndex(useIndex: boolean): MergeInsertBuilder {
return new MergeInsertBuilder(
this.#native.useIndex(useIndex),
this.#schema,
);
}
/**
* Executes the merge insert operation
*

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-arm64",
"version": "0.22.1-beta.3",
"version": "0.22.1",
"os": ["darwin"],
"cpu": ["arm64"],
"main": "lancedb.darwin-arm64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-x64",
"version": "0.22.1-beta.3",
"version": "0.22.1",
"os": ["darwin"],
"cpu": ["x64"],
"main": "lancedb.darwin-x64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-gnu",
"version": "0.22.1-beta.3",
"version": "0.22.1",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-musl",
"version": "0.22.1-beta.3",
"version": "0.22.1",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-musl.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-gnu",
"version": "0.22.1-beta.3",
"version": "0.22.1",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-musl",
"version": "0.22.1-beta.3",
"version": "0.22.1",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-musl.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-arm64-msvc",
"version": "0.22.1-beta.3",
"version": "0.22.1",
"os": [
"win32"
],

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-x64-msvc",
"version": "0.22.1-beta.3",
"version": "0.22.1",
"os": ["win32"],
"cpu": ["x64"],
"main": "lancedb.win32-x64-msvc.node",

View File

@@ -1,12 +1,12 @@
{
"name": "@lancedb/lancedb",
"version": "0.22.1-beta.3",
"version": "0.22.1",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "@lancedb/lancedb",
"version": "0.22.1-beta.3",
"version": "0.22.1",
"cpu": [
"x64",
"arm64"

View File

@@ -11,7 +11,7 @@
"ann"
],
"private": false,
"version": "0.22.1-beta.3",
"version": "0.22.1",
"main": "dist/index.js",
"exports": {
".": "./dist/index.js",

View File

@@ -43,6 +43,13 @@ impl NativeMergeInsertBuilder {
self.inner.timeout(Duration::from_millis(timeout as u64));
}
#[napi]
pub fn use_index(&self, use_index: bool) -> Self {
let mut this = self.clone();
this.inner.use_index(use_index);
this
}
#[napi(catch_unwind)]
pub async fn execute(&self, buf: Buffer) -> napi::Result<MergeResult> {
let data = ipc_file_to_batches(buf.to_vec())

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.25.1"
current_version = "0.25.2-beta.0"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-python"
version = "0.25.1"
version = "0.25.2-beta.0"
edition.workspace = true
description = "Python bindings for LanceDB"
license.workspace = true

View File

@@ -33,6 +33,7 @@ class LanceMergeInsertBuilder(object):
self._when_not_matched_by_source_delete = False
self._when_not_matched_by_source_condition = None
self._timeout = None
self._use_index = True
def when_matched_update_all(
self, *, where: Optional[str] = None
@@ -78,6 +79,23 @@ class LanceMergeInsertBuilder(object):
self._when_not_matched_by_source_condition = condition
return self
def use_index(self, use_index: bool) -> LanceMergeInsertBuilder:
"""
Controls whether to use indexes for the merge operation.
When set to `True` (the default), the operation will use an index if available
on the join key for improved performance. When set to `False`, it forces a full
table scan even if an index exists. This can be useful for benchmarking or when
the query optimizer chooses a suboptimal path.
Parameters
----------
use_index: bool
Whether to use indices for the merge operation. Defaults to `True`.
"""
self._use_index = use_index
return self
def execute(
self,
new_data: DATA,

View File

@@ -3920,6 +3920,7 @@ class AsyncTable:
when_not_matched_by_source_delete=merge._when_not_matched_by_source_delete,
when_not_matched_by_source_condition=merge._when_not_matched_by_source_condition,
timeout=merge._timeout,
use_index=merge._use_index,
),
)

View File

@@ -672,6 +672,9 @@ impl Table {
if let Some(timeout) = parameters.timeout {
builder.timeout(timeout);
}
if let Some(use_index) = parameters.use_index {
builder.use_index(use_index);
}
future_into_py(self_.py(), async move {
let res = builder.execute(Box::new(batches)).await.infer_error()?;
@@ -831,6 +834,7 @@ pub struct MergeInsertParams {
when_not_matched_by_source_delete: bool,
when_not_matched_by_source_condition: Option<String>,
timeout: Option<std::time::Duration>,
use_index: Option<bool>,
}
#[pyclass]

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb"
version = "0.22.1-beta.3"
version = "0.22.1"
edition.workspace = true
description = "LanceDB: A serverless, low-latency vector database for AI applications"
license.workspace = true
@@ -36,6 +36,7 @@ lance-table = { workspace = true }
lance-linalg = { workspace = true }
lance-testing = { workspace = true }
lance-encoding = { workspace = true }
lance-namespace = { workspace = true }
moka = { workspace = true }
pin-project = { workspace = true }
tokio = { version = "1.23", features = ["rt-multi-thread"] }

View File

@@ -1015,6 +1015,117 @@ pub fn connect(uri: &str) -> ConnectBuilder {
ConnectBuilder::new(uri)
}
pub struct ConnectNamespaceBuilder {
ns_impl: String,
properties: HashMap<String, String>,
storage_options: HashMap<String, String>,
read_consistency_interval: Option<std::time::Duration>,
embedding_registry: Option<Arc<dyn EmbeddingRegistry>>,
session: Option<Arc<lance::session::Session>>,
}
impl ConnectNamespaceBuilder {
fn new(ns_impl: &str, properties: HashMap<String, String>) -> Self {
Self {
ns_impl: ns_impl.to_string(),
properties,
storage_options: HashMap::new(),
read_consistency_interval: None,
embedding_registry: None,
session: None,
}
}
/// Set an option for the storage layer.
///
/// See available options at <https://lancedb.github.io/lancedb/guides/storage/>
pub fn storage_option(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
self.storage_options.insert(key.into(), value.into());
self
}
/// Set multiple options for the storage layer.
///
/// See available options at <https://lancedb.github.io/lancedb/guides/storage/>
pub fn storage_options(
mut self,
pairs: impl IntoIterator<Item = (impl Into<String>, impl Into<String>)>,
) -> Self {
for (key, value) in pairs {
self.storage_options.insert(key.into(), value.into());
}
self
}
/// The interval at which to check for updates from other processes.
///
/// If left unset, consistency is not checked. For maximum read
/// performance, this is the default. For strong consistency, set this to
/// zero seconds. Then every read will check for updates from other processes.
/// As a compromise, set this to a non-zero duration for eventual consistency.
pub fn read_consistency_interval(
mut self,
read_consistency_interval: std::time::Duration,
) -> Self {
self.read_consistency_interval = Some(read_consistency_interval);
self
}
/// Provide a custom [`EmbeddingRegistry`] to use for this connection.
pub fn embedding_registry(mut self, registry: Arc<dyn EmbeddingRegistry>) -> Self {
self.embedding_registry = Some(registry);
self
}
/// Set a custom session for object stores and caching.
///
/// By default, a new session with default configuration will be created.
/// This method allows you to provide a custom session with your own
/// configuration for object store registries, caching, etc.
pub fn session(mut self, session: Arc<lance::session::Session>) -> Self {
self.session = Some(session);
self
}
/// Execute the connection
pub async fn execute(self) -> Result<Connection> {
use crate::database::namespace::LanceNamespaceDatabase;
let internal = Arc::new(
LanceNamespaceDatabase::connect(
&self.ns_impl,
self.properties,
self.storage_options,
self.read_consistency_interval,
self.session,
)
.await?,
);
Ok(Connection {
internal,
uri: format!("namespace://{}", self.ns_impl),
embedding_registry: self
.embedding_registry
.unwrap_or_else(|| Arc::new(MemoryRegistry::new())),
})
}
}
/// Connect to a LanceDB database through a namespace.
///
/// # Arguments
///
/// * `ns_impl` - The namespace implementation to use (e.g., "dir" for directory-based, "rest" for REST API)
/// * `properties` - Configuration properties for the namespace implementation
/// ```
pub fn connect_namespace(
ns_impl: &str,
properties: HashMap<String, String>,
) -> ConnectNamespaceBuilder {
ConnectNamespaceBuilder::new(ns_impl, properties)
}
#[cfg(all(test, feature = "remote"))]
mod test_utils {
use super::*;

View File

@@ -29,6 +29,7 @@ use crate::error::Result;
use crate::table::{BaseTable, TableDefinition, WriteOptions};
pub mod listing;
pub mod namespace;
pub trait DatabaseOptions {
fn serialize_into_map(&self, map: &mut HashMap<String, String>);

View File

@@ -0,0 +1,840 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
//! Namespace-based database implementation that delegates table management to lance-namespace
use std::collections::HashMap;
use std::sync::Arc;
use async_trait::async_trait;
use lance_namespace::{
connect as connect_namespace,
models::{
CreateEmptyTableRequest, CreateNamespaceRequest, DescribeTableRequest,
DropNamespaceRequest, DropTableRequest, ListNamespacesRequest, ListTablesRequest,
},
LanceNamespace,
};
use crate::connection::ConnectRequest;
use crate::database::listing::ListingDatabase;
use crate::error::{Error, Result};
use super::{
BaseTable, CloneTableRequest, CreateNamespaceRequest as DbCreateNamespaceRequest,
CreateTableMode, CreateTableRequest as DbCreateTableRequest, Database,
DropNamespaceRequest as DbDropNamespaceRequest,
ListNamespacesRequest as DbListNamespacesRequest, OpenTableRequest, TableNamesRequest,
};
/// A database implementation that uses lance-namespace for table management
pub struct LanceNamespaceDatabase {
namespace: Arc<dyn LanceNamespace>,
// Storage options to be inherited by tables
storage_options: HashMap<String, String>,
// Read consistency interval for tables
read_consistency_interval: Option<std::time::Duration>,
// Optional session for object stores and caching
session: Option<Arc<lance::session::Session>>,
}
impl LanceNamespaceDatabase {
pub async fn connect(
ns_impl: &str,
ns_properties: HashMap<String, String>,
storage_options: HashMap<String, String>,
read_consistency_interval: Option<std::time::Duration>,
session: Option<Arc<lance::session::Session>>,
) -> Result<Self> {
let namespace = connect_namespace(ns_impl, ns_properties.clone())
.await
.map_err(|e| Error::InvalidInput {
message: format!("Failed to connect to namespace: {:?}", e),
})?;
Ok(Self {
namespace,
storage_options,
read_consistency_interval,
session,
})
}
/// Helper method to create a ListingDatabase from a table location
///
/// This method:
/// 1. Validates that the location ends with <table_name>.lance
/// 2. Extracts the parent directory from the location
/// 3. Creates a ListingDatabase at that parent directory
async fn create_listing_database(
&self,
table_name: &str,
location: &str,
additional_storage_options: Option<HashMap<String, String>>,
) -> Result<Arc<ListingDatabase>> {
let expected_suffix = format!("{}.lance", table_name);
if !location.ends_with(&expected_suffix) {
return Err(Error::Runtime {
message: format!(
"Invalid table location '{}': expected to end with '{}'",
location, expected_suffix
),
});
}
let parent_dir = location
.rsplit_once('/')
.map(|(parent, _)| parent.to_string())
.ok_or_else(|| Error::Runtime {
message: format!("Invalid table location '{}': no parent directory", location),
})?;
let mut merged_storage_options = self.storage_options.clone();
if let Some(opts) = additional_storage_options {
merged_storage_options.extend(opts);
}
let connect_request = ConnectRequest {
uri: parent_dir,
options: merged_storage_options,
read_consistency_interval: self.read_consistency_interval,
session: self.session.clone(),
#[cfg(feature = "remote")]
client_config: Default::default(),
};
let listing_db = ListingDatabase::connect_with_options(&connect_request)
.await
.map_err(|e| Error::Runtime {
message: format!("Failed to create listing database: {}", e),
})?;
Ok(Arc::new(listing_db))
}
}
impl std::fmt::Debug for LanceNamespaceDatabase {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("LanceNamespaceDatabase")
.field("storage_options", &self.storage_options)
.field("read_consistency_interval", &self.read_consistency_interval)
.finish()
}
}
impl std::fmt::Display for LanceNamespaceDatabase {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "LanceNamespaceDatabase")
}
}
#[async_trait]
impl Database for LanceNamespaceDatabase {
async fn list_namespaces(&self, request: DbListNamespacesRequest) -> Result<Vec<String>> {
let ns_request = ListNamespacesRequest {
id: if request.namespace.is_empty() {
None
} else {
Some(request.namespace)
},
page_token: request.page_token,
limit: request.limit.map(|l| l as i32),
};
let response = self
.namespace
.list_namespaces(ns_request)
.await
.map_err(|e| Error::Runtime {
message: format!("Failed to list namespaces: {}", e),
})?;
Ok(response.namespaces)
}
async fn create_namespace(&self, request: DbCreateNamespaceRequest) -> Result<()> {
let ns_request = CreateNamespaceRequest {
id: if request.namespace.is_empty() {
None
} else {
Some(request.namespace)
},
mode: None,
properties: None,
};
self.namespace
.create_namespace(ns_request)
.await
.map_err(|e| Error::Runtime {
message: format!("Failed to create namespace: {}", e),
})?;
Ok(())
}
async fn drop_namespace(&self, request: DbDropNamespaceRequest) -> Result<()> {
let ns_request = DropNamespaceRequest {
id: if request.namespace.is_empty() {
None
} else {
Some(request.namespace)
},
mode: None,
behavior: None,
};
self.namespace
.drop_namespace(ns_request)
.await
.map_err(|e| Error::Runtime {
message: format!("Failed to drop namespace: {}", e),
})?;
Ok(())
}
async fn table_names(&self, request: TableNamesRequest) -> Result<Vec<String>> {
let ns_request = ListTablesRequest {
id: if request.namespace.is_empty() {
None
} else {
Some(request.namespace)
},
page_token: request.start_after,
limit: request.limit.map(|l| l as i32),
};
let response =
self.namespace
.list_tables(ns_request)
.await
.map_err(|e| Error::Runtime {
message: format!("Failed to list tables: {}", e),
})?;
Ok(response.tables)
}
async fn create_table(&self, request: DbCreateTableRequest) -> Result<Arc<dyn BaseTable>> {
let mut table_id = request.namespace.clone();
table_id.push(request.name.clone());
let describe_request = DescribeTableRequest {
id: Some(table_id.clone()),
version: None,
};
let describe_result = self.namespace.describe_table(describe_request).await;
match request.mode {
CreateTableMode::Create => {
if describe_result.is_ok() {
return Err(Error::TableAlreadyExists {
name: request.name.clone(),
});
}
}
CreateTableMode::Overwrite => {
if describe_result.is_ok() {
// Drop the existing table - must succeed
let drop_request = DropTableRequest {
id: Some(table_id.clone()),
};
self.namespace
.drop_table(drop_request)
.await
.map_err(|e| Error::Runtime {
message: format!("Failed to drop existing table for overwrite: {}", e),
})?;
}
}
CreateTableMode::ExistOk(_) => {
if let Ok(response) = describe_result {
let location = response.location.ok_or_else(|| Error::Runtime {
message: "Table location is missing from namespace response".to_string(),
})?;
let listing_db = self
.create_listing_database(&request.name, &location, response.storage_options)
.await?;
return listing_db
.open_table(OpenTableRequest {
name: request.name.clone(),
namespace: request.namespace.clone(),
index_cache_size: None,
lance_read_params: None,
})
.await;
}
}
}
let mut table_id = request.namespace.clone();
table_id.push(request.name.clone());
let create_empty_request = CreateEmptyTableRequest {
id: Some(table_id),
location: None,
properties: if self.storage_options.is_empty() {
None
} else {
Some(self.storage_options.clone())
},
};
let create_empty_response = self
.namespace
.create_empty_table(create_empty_request)
.await
.map_err(|e| Error::Runtime {
message: format!("Failed to create empty table: {}", e),
})?;
let location = create_empty_response
.location
.ok_or_else(|| Error::Runtime {
message: "Table location is missing from create_empty_table response".to_string(),
})?;
let listing_db = self
.create_listing_database(
&request.name,
&location,
create_empty_response.storage_options,
)
.await?;
listing_db.create_table(request).await
}
async fn open_table(&self, request: OpenTableRequest) -> Result<Arc<dyn BaseTable>> {
let mut table_id = request.namespace.clone();
table_id.push(request.name.clone());
let describe_request = DescribeTableRequest {
id: Some(table_id),
version: None,
};
let response = self
.namespace
.describe_table(describe_request)
.await
.map_err(|e| Error::Runtime {
message: format!("Failed to describe table: {}", e),
})?;
let location = response.location.ok_or_else(|| Error::Runtime {
message: "Table location is missing from namespace response".to_string(),
})?;
let listing_db = self
.create_listing_database(&request.name, &location, response.storage_options)
.await?;
listing_db.open_table(request).await
}
async fn clone_table(&self, _request: CloneTableRequest) -> Result<Arc<dyn BaseTable>> {
Err(Error::NotSupported {
message: "clone_table is not supported for namespace connections".to_string(),
})
}
async fn rename_table(
&self,
_cur_name: &str,
_new_name: &str,
_cur_namespace: &[String],
_new_namespace: &[String],
) -> Result<()> {
Err(Error::NotSupported {
message: "rename_table is not supported for namespace connections".to_string(),
})
}
async fn drop_table(&self, name: &str, namespace: &[String]) -> Result<()> {
let mut table_id = namespace.to_vec();
table_id.push(name.to_string());
let drop_request = DropTableRequest { id: Some(table_id) };
self.namespace
.drop_table(drop_request)
.await
.map_err(|e| Error::Runtime {
message: format!("Failed to drop table: {}", e),
})?;
Ok(())
}
async fn drop_all_tables(&self, namespace: &[String]) -> Result<()> {
let tables = self
.table_names(TableNamesRequest {
namespace: namespace.to_vec(),
start_after: None,
limit: None,
})
.await?;
for table in tables {
self.drop_table(&table, namespace).await?;
}
Ok(())
}
fn as_any(&self) -> &dyn std::any::Any {
self
}
}
#[cfg(test)]
#[cfg(not(windows))] // TODO: support windows for lance-namespace
mod tests {
use super::*;
use crate::connect_namespace;
use crate::query::ExecutableQuery;
use arrow_array::{Int32Array, RecordBatch, RecordBatchIterator, StringArray};
use arrow_schema::{DataType, Field, Schema};
use futures::TryStreamExt;
use tempfile::tempdir;
/// Helper function to create test data
fn create_test_data() -> RecordBatchIterator<
std::vec::IntoIter<std::result::Result<RecordBatch, arrow_schema::ArrowError>>,
> {
let schema = Arc::new(Schema::new(vec![
Field::new("id", DataType::Int32, false),
Field::new("name", DataType::Utf8, false),
]));
let id_array = Int32Array::from(vec![1, 2, 3, 4, 5]);
let name_array = StringArray::from(vec!["Alice", "Bob", "Charlie", "David", "Eve"]);
let batch = RecordBatch::try_new(
schema.clone(),
vec![Arc::new(id_array), Arc::new(name_array)],
)
.unwrap();
RecordBatchIterator::new(vec![std::result::Result::Ok(batch)].into_iter(), schema)
}
#[tokio::test]
async fn test_namespace_connection_simple() {
// Test that namespace connections work with simple connect_namespace(impl_type, properties)
let tmp_dir = tempdir().unwrap();
let root_path = tmp_dir.path().to_str().unwrap().to_string();
let mut properties = HashMap::new();
properties.insert("root".to_string(), root_path);
// This should succeed with directory-based namespace
let result = connect_namespace("dir", properties).execute().await;
assert!(result.is_ok());
}
#[tokio::test]
async fn test_namespace_connection_with_storage_options() {
// Test namespace connections with storage options
let tmp_dir = tempdir().unwrap();
let root_path = tmp_dir.path().to_str().unwrap().to_string();
let mut properties = HashMap::new();
properties.insert("root".to_string(), root_path);
// This should succeed with directory-based namespace and storage options
let result = connect_namespace("dir", properties)
.storage_option("timeout", "30s")
.execute()
.await;
assert!(result.is_ok());
}
#[tokio::test]
async fn test_namespace_connection_with_all_options() {
use crate::embeddings::MemoryRegistry;
use std::time::Duration;
// Test namespace connections with all configuration options
let tmp_dir = tempdir().unwrap();
let root_path = tmp_dir.path().to_str().unwrap().to_string();
let mut properties = HashMap::new();
properties.insert("root".to_string(), root_path);
let embedding_registry = Arc::new(MemoryRegistry::new());
let session = Arc::new(lance::session::Session::default());
// Test with all options set
let result = connect_namespace("dir", properties)
.storage_option("timeout", "30s")
.storage_options([("cache_size", "1gb"), ("region", "us-east-1")])
.read_consistency_interval(Duration::from_secs(5))
.embedding_registry(embedding_registry.clone())
.session(session.clone())
.execute()
.await;
assert!(result.is_ok());
let conn = result.unwrap();
// Verify embedding registry is set correctly
assert!(std::ptr::eq(
conn.embedding_registry() as *const _,
embedding_registry.as_ref() as *const _
));
}
#[tokio::test]
async fn test_namespace_create_table_basic() {
// Setup: Create a temporary directory for the namespace
let tmp_dir = tempdir().unwrap();
let root_path = tmp_dir.path().to_str().unwrap().to_string();
// Connect to namespace using DirectoryNamespace
let mut properties = HashMap::new();
properties.insert("root".to_string(), root_path);
let conn = connect_namespace("dir", properties)
.execute()
.await
.expect("Failed to connect to namespace");
// Test: Create a table
let test_data = create_test_data();
let table = conn
.create_table("test_table", test_data)
.execute()
.await
.expect("Failed to create table");
// Verify: Table was created and can be queried
let results = table
.query()
.execute()
.await
.expect("Failed to query table")
.try_collect::<Vec<_>>()
.await
.expect("Failed to collect results");
assert_eq!(results.len(), 1);
assert_eq!(results[0].num_rows(), 5);
// Verify: Table appears in table_names
let table_names = conn
.table_names()
.execute()
.await
.expect("Failed to list tables");
assert!(table_names.contains(&"test_table".to_string()));
}
#[tokio::test]
async fn test_namespace_describe_table() {
// Setup: Create a temporary directory for the namespace
let tmp_dir = tempdir().unwrap();
let root_path = tmp_dir.path().to_str().unwrap().to_string();
// Connect to namespace
let mut properties = HashMap::new();
properties.insert("root".to_string(), root_path);
let conn = connect_namespace("dir", properties)
.execute()
.await
.expect("Failed to connect to namespace");
// Create a table first
let test_data = create_test_data();
let _table = conn
.create_table("describe_test", test_data)
.execute()
.await
.expect("Failed to create table");
// Test: Open the table (which internally uses describe_table)
let opened_table = conn
.open_table("describe_test")
.execute()
.await
.expect("Failed to open table");
// Verify: Can query the opened table
let results = opened_table
.query()
.execute()
.await
.expect("Failed to query table")
.try_collect::<Vec<_>>()
.await
.expect("Failed to collect results");
assert_eq!(results.len(), 1);
assert_eq!(results[0].num_rows(), 5);
// Verify schema matches
let schema = opened_table.schema().await.expect("Failed to get schema");
assert_eq!(schema.fields.len(), 2);
assert_eq!(schema.field(0).name(), "id");
assert_eq!(schema.field(1).name(), "name");
}
#[tokio::test]
async fn test_namespace_create_table_overwrite_mode() {
// Setup: Create a temporary directory for the namespace
let tmp_dir = tempdir().unwrap();
let root_path = tmp_dir.path().to_str().unwrap().to_string();
let mut properties = HashMap::new();
properties.insert("root".to_string(), root_path);
let conn = connect_namespace("dir", properties)
.execute()
.await
.expect("Failed to connect to namespace");
// Create initial table with 5 rows
let test_data1 = create_test_data();
let _table1 = conn
.create_table("overwrite_test", test_data1)
.execute()
.await
.expect("Failed to create table");
// Create new data with 3 rows
let schema = Arc::new(Schema::new(vec![
Field::new("id", DataType::Int32, false),
Field::new("name", DataType::Utf8, false),
]));
let id_array = Int32Array::from(vec![10, 20, 30]);
let name_array = StringArray::from(vec!["New1", "New2", "New3"]);
let test_data2 = RecordBatch::try_new(
schema.clone(),
vec![Arc::new(id_array), Arc::new(name_array)],
)
.unwrap();
// Test: Overwrite the table
let table2 = conn
.create_table(
"overwrite_test",
RecordBatchIterator::new(
vec![std::result::Result::Ok(test_data2)].into_iter(),
schema,
),
)
.mode(CreateTableMode::Overwrite)
.execute()
.await
.expect("Failed to overwrite table");
// Verify: Table has new data (3 rows instead of 5)
let results = table2
.query()
.execute()
.await
.expect("Failed to query table")
.try_collect::<Vec<_>>()
.await
.expect("Failed to collect results");
assert_eq!(results.len(), 1);
assert_eq!(results[0].num_rows(), 3);
// Verify the data is actually the new data
let id_col = results[0]
.column(0)
.as_any()
.downcast_ref::<Int32Array>()
.unwrap();
assert_eq!(id_col.value(0), 10);
assert_eq!(id_col.value(1), 20);
assert_eq!(id_col.value(2), 30);
}
#[tokio::test]
async fn test_namespace_create_table_exist_ok_mode() {
// Setup: Create a temporary directory for the namespace
let tmp_dir = tempdir().unwrap();
let root_path = tmp_dir.path().to_str().unwrap().to_string();
let mut properties = HashMap::new();
properties.insert("root".to_string(), root_path);
let conn = connect_namespace("dir", properties)
.execute()
.await
.expect("Failed to connect to namespace");
// Create initial table with test data
let test_data1 = create_test_data();
let _table1 = conn
.create_table("exist_ok_test", test_data1)
.execute()
.await
.expect("Failed to create table");
// Try to create again with exist_ok mode
let test_data2 = create_test_data();
let table2 = conn
.create_table("exist_ok_test", test_data2)
.mode(CreateTableMode::exist_ok(|req| req))
.execute()
.await
.expect("Failed with exist_ok mode");
// Verify: Table still has original data (5 rows)
let results = table2
.query()
.execute()
.await
.expect("Failed to query table")
.try_collect::<Vec<_>>()
.await
.expect("Failed to collect results");
assert_eq!(results.len(), 1);
assert_eq!(results[0].num_rows(), 5);
}
#[tokio::test]
async fn test_namespace_create_multiple_tables() {
// Setup: Create a temporary directory for the namespace
let tmp_dir = tempdir().unwrap();
let root_path = tmp_dir.path().to_str().unwrap().to_string();
let mut properties = HashMap::new();
properties.insert("root".to_string(), root_path);
let conn = connect_namespace("dir", properties)
.execute()
.await
.expect("Failed to connect to namespace");
// Create first table
let test_data1 = create_test_data();
let _table1 = conn
.create_table("table1", test_data1)
.execute()
.await
.expect("Failed to create first table");
// Create second table
let test_data2 = create_test_data();
let _table2 = conn
.create_table("table2", test_data2)
.execute()
.await
.expect("Failed to create second table");
// Verify: Both tables appear in table list
let table_names = conn
.table_names()
.execute()
.await
.expect("Failed to list tables");
assert!(table_names.contains(&"table1".to_string()));
assert!(table_names.contains(&"table2".to_string()));
// Verify: Can open both tables
let opened_table1 = conn
.open_table("table1")
.execute()
.await
.expect("Failed to open table1");
let opened_table2 = conn
.open_table("table2")
.execute()
.await
.expect("Failed to open table2");
// Verify both tables work
let count1 = opened_table1
.count_rows(None)
.await
.expect("Failed to count rows in table1");
assert_eq!(count1, 5);
let count2 = opened_table2
.count_rows(None)
.await
.expect("Failed to count rows in table2");
assert_eq!(count2, 5);
}
#[tokio::test]
async fn test_namespace_table_not_found() {
// Setup: Create a temporary directory for the namespace
let tmp_dir = tempdir().unwrap();
let root_path = tmp_dir.path().to_str().unwrap().to_string();
let mut properties = HashMap::new();
properties.insert("root".to_string(), root_path);
let conn = connect_namespace("dir", properties)
.execute()
.await
.expect("Failed to connect to namespace");
// Test: Try to open a non-existent table
let result = conn.open_table("non_existent_table").execute().await;
// Verify: Should return an error
assert!(result.is_err());
}
#[tokio::test]
async fn test_namespace_drop_table() {
// Setup: Create a temporary directory for the namespace
let tmp_dir = tempdir().unwrap();
let root_path = tmp_dir.path().to_str().unwrap().to_string();
let mut properties = HashMap::new();
properties.insert("root".to_string(), root_path);
let conn = connect_namespace("dir", properties)
.execute()
.await
.expect("Failed to connect to namespace");
// Create a table first
let test_data = create_test_data();
let _table = conn
.create_table("drop_test", test_data)
.execute()
.await
.expect("Failed to create table");
// Verify table exists
let table_names_before = conn
.table_names()
.execute()
.await
.expect("Failed to list tables");
assert!(table_names_before.contains(&"drop_test".to_string()));
// Test: Drop the table
conn.drop_table("drop_test", &[])
.await
.expect("Failed to drop table");
// Verify: Table no longer exists
let table_names_after = conn
.table_names()
.execute()
.await
.expect("Failed to list tables");
assert!(!table_names_after.contains(&"drop_test".to_string()));
// Verify: Cannot open dropped table
let open_result = conn.open_table("drop_test").execute().await;
assert!(open_result.is_err());
}
}

View File

@@ -212,7 +212,7 @@ use std::fmt::Display;
use serde::{Deserialize, Serialize};
pub use connection::Connection;
pub use connection::{ConnectNamespaceBuilder, Connection};
pub use error::{Error, Result};
use lance_linalg::distance::DistanceType as LanceDistanceType;
pub use table::Table;
@@ -289,6 +289,8 @@ impl Display for DistanceType {
/// Connect to a database
pub use connection::connect;
/// Connect to a namespace-backed database
pub use connection::connect_namespace;
/// Re-export Lance Session and ObjectStoreRegistry for custom session creation
pub use lance::session::Session;

View File

@@ -1452,6 +1452,14 @@ struct MergeInsertRequest {
when_not_matched_insert_all: bool,
when_not_matched_by_source_delete: bool,
when_not_matched_by_source_delete_filt: Option<String>,
// For backwards compatibility, only serialize use_index when it's false
// (the default is true)
#[serde(skip_serializing_if = "is_true")]
use_index: bool,
}
fn is_true(b: &bool) -> bool {
*b
}
impl TryFrom<MergeInsertBuilder> for MergeInsertRequest {
@@ -1476,6 +1484,8 @@ impl TryFrom<MergeInsertBuilder> for MergeInsertRequest {
when_not_matched_insert_all: value.when_not_matched_insert_all,
when_not_matched_by_source_delete: value.when_not_matched_by_source_delete,
when_not_matched_by_source_delete_filt: value.when_not_matched_by_source_delete_filt,
// Only serialize use_index when it's false for backwards compatibility
use_index: value.use_index,
})
}
}
@@ -1942,6 +1952,7 @@ mod tests {
assert_eq!(params["when_not_matched_by_source_delete"], "false");
assert!(!params.contains_key("when_matched_update_all_filt"));
assert!(!params.contains_key("when_not_matched_by_source_delete_filt"));
assert!(!params.contains_key("use_index"));
if old_server {
http::Response::builder().status(200).body("{}").unwrap()

View File

@@ -2399,6 +2399,7 @@ impl BaseTable for NativeTable {
} else {
builder.when_not_matched_by_source(WhenNotMatchedBySource::Keep);
}
builder.use_index(params.use_index);
let future = if let Some(timeout) = params.timeout {
// The default retry timeout is 30s, so we pass the full timeout down
@@ -2906,6 +2907,38 @@ mod tests {
);
}
#[tokio::test]
async fn test_merge_insert_use_index() {
let tmp_dir = tempdir().unwrap();
let uri = tmp_dir.path().to_str().unwrap();
let conn = connect(uri).execute().await.unwrap();
// Create a dataset with i=0..10
let batches = merge_insert_test_batches(0, 0);
let table = conn
.create_table("my_table", batches)
.execute()
.await
.unwrap();
assert_eq!(table.count_rows(None).await.unwrap(), 10);
// Test use_index=true (default behavior)
let new_batches = Box::new(merge_insert_test_batches(5, 1));
let mut merge_insert_builder = table.merge_insert(&["i"]);
merge_insert_builder.when_not_matched_insert_all();
merge_insert_builder.use_index(true);
merge_insert_builder.execute(new_batches).await.unwrap();
assert_eq!(table.count_rows(None).await.unwrap(), 15);
// Test use_index=false (force table scan)
let new_batches = Box::new(merge_insert_test_batches(15, 2));
let mut merge_insert_builder = table.merge_insert(&["i"]);
merge_insert_builder.when_not_matched_insert_all();
merge_insert_builder.use_index(false);
merge_insert_builder.execute(new_batches).await.unwrap();
assert_eq!(table.count_rows(None).await.unwrap(), 25);
}
#[tokio::test]
async fn test_add_overwrite() {
let tmp_dir = tempdir().unwrap();

View File

@@ -22,6 +22,7 @@ pub struct MergeInsertBuilder {
pub(crate) when_not_matched_by_source_delete: bool,
pub(crate) when_not_matched_by_source_delete_filt: Option<String>,
pub(crate) timeout: Option<Duration>,
pub(crate) use_index: bool,
}
impl MergeInsertBuilder {
@@ -35,6 +36,7 @@ impl MergeInsertBuilder {
when_not_matched_by_source_delete: false,
when_not_matched_by_source_delete_filt: None,
timeout: None,
use_index: true,
}
}
@@ -101,6 +103,19 @@ impl MergeInsertBuilder {
self
}
/// Controls whether to use indexes for the merge operation.
///
/// When set to `true` (the default), the operation will use an index if available
/// on the join key for improved performance. When set to `false`, it forces a full
/// table scan even if an index exists. This can be useful for benchmarking or when
/// the query optimizer chooses a suboptimal path.
///
/// If not set, defaults to `true` (use index if available).
pub fn use_index(&mut self, use_index: bool) -> &mut Self {
self.use_index = use_index;
self
}
/// Executes the merge insert operation
///
/// Returns version and statistics about the merge operation including the number of rows