mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-27 15:12:53 +00:00
Compare commits
14 Commits
python-v0.
...
python-v0.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
247fb58400 | ||
|
|
504bdc471c | ||
|
|
d617cdef4a | ||
|
|
356d7046fd | ||
|
|
48e5caabda | ||
|
|
d6cc68f671 | ||
|
|
55eacfa685 | ||
|
|
222e3264ab | ||
|
|
13505026cb | ||
|
|
b0800b4b71 | ||
|
|
1befebf614 | ||
|
|
1ab60fae7f | ||
|
|
e921c90c1b | ||
|
|
05a4ea646a |
@@ -1,5 +1,5 @@
|
|||||||
[tool.bumpversion]
|
[tool.bumpversion]
|
||||||
current_version = "0.22.1-beta.2"
|
current_version = "0.22.1"
|
||||||
parse = """(?x)
|
parse = """(?x)
|
||||||
(?P<major>0|[1-9]\\d*)\\.
|
(?P<major>0|[1-9]\\d*)\\.
|
||||||
(?P<minor>0|[1-9]\\d*)\\.
|
(?P<minor>0|[1-9]\\d*)\\.
|
||||||
|
|||||||
2
.github/workflows/nodejs.yml
vendored
2
.github/workflows/nodejs.yml
vendored
@@ -116,7 +116,7 @@ jobs:
|
|||||||
set -e
|
set -e
|
||||||
npm ci
|
npm ci
|
||||||
npm run docs
|
npm run docs
|
||||||
if ! git diff --exit-code -- . ':(exclude)Cargo.lock'; then
|
if ! git diff --exit-code -- ../ ':(exclude)Cargo.lock'; then
|
||||||
echo "Docs need to be updated"
|
echo "Docs need to be updated"
|
||||||
echo "Run 'npm run docs', fix any warnings, and commit the changes."
|
echo "Run 'npm run docs', fix any warnings, and commit the changes."
|
||||||
exit 1
|
exit 1
|
||||||
|
|||||||
1572
Cargo.lock
generated
1572
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
17
Cargo.toml
17
Cargo.toml
@@ -15,14 +15,15 @@ categories = ["database-implementations"]
|
|||||||
rust-version = "1.78.0"
|
rust-version = "1.78.0"
|
||||||
|
|
||||||
[workspace.dependencies]
|
[workspace.dependencies]
|
||||||
lance = { "version" = "=0.35.0", default-features = false, "features" = ["dynamodb"], "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
|
lance = { "version" = "=0.37.0", default-features = false, "features" = ["dynamodb"] }
|
||||||
lance-io = { "version" = "=0.35.0", default-features = false, "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
|
lance-io = { "version" = "=0.37.0", default-features = false }
|
||||||
lance-index = { "version" = "=0.35.0", "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
|
lance-index = "=0.37.0"
|
||||||
lance-linalg = { "version" = "=0.35.0", "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
|
lance-linalg = "=0.37.0"
|
||||||
lance-table = { "version" = "=0.35.0", "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
|
lance-table = "=0.37.0"
|
||||||
lance-testing = { "version" = "=0.35.0", "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
|
lance-testing = "=0.37.0"
|
||||||
lance-datafusion = { "version" = "=0.35.0", "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
|
lance-datafusion = "=0.37.0"
|
||||||
lance-encoding = { "version" = "=0.35.0", "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
|
lance-encoding = "=0.37.0"
|
||||||
|
lance-namespace = "0.0.15"
|
||||||
# Note that this one does not include pyarrow
|
# Note that this one does not include pyarrow
|
||||||
arrow = { version = "55.1", optional = false }
|
arrow = { version = "55.1", optional = false }
|
||||||
arrow-array = "55.1"
|
arrow-array = "55.1"
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
import argparse
|
import argparse
|
||||||
|
import re
|
||||||
import sys
|
import sys
|
||||||
import json
|
import json
|
||||||
|
|
||||||
@@ -18,8 +19,12 @@ def run_command(command: str) -> str:
|
|||||||
|
|
||||||
def get_latest_stable_version() -> str:
|
def get_latest_stable_version() -> str:
|
||||||
version_line = run_command("cargo info lance | grep '^version:'")
|
version_line = run_command("cargo info lance | grep '^version:'")
|
||||||
version = version_line.split(" ")[1].strip()
|
# Example output: "version: 0.35.0 (latest 0.37.0)"
|
||||||
return version
|
match = re.search(r'\(latest ([0-9.]+)\)', version_line)
|
||||||
|
if match:
|
||||||
|
return match.group(1)
|
||||||
|
# Fallback: use the first version after 'version:'
|
||||||
|
return version_line.split("version:")[1].split()[0].strip()
|
||||||
|
|
||||||
|
|
||||||
def get_latest_preview_version() -> str:
|
def get_latest_preview_version() -> str:
|
||||||
|
|||||||
@@ -25,6 +25,51 @@ the underlying connection has been closed.
|
|||||||
|
|
||||||
## Methods
|
## Methods
|
||||||
|
|
||||||
|
### cloneTable()
|
||||||
|
|
||||||
|
```ts
|
||||||
|
abstract cloneTable(
|
||||||
|
targetTableName,
|
||||||
|
sourceUri,
|
||||||
|
options?): Promise<Table>
|
||||||
|
```
|
||||||
|
|
||||||
|
Clone a table from a source table.
|
||||||
|
|
||||||
|
A shallow clone creates a new table that shares the underlying data files
|
||||||
|
with the source table but has its own independent manifest. This allows
|
||||||
|
both the source and cloned tables to evolve independently while initially
|
||||||
|
sharing the same data, deletion, and index files.
|
||||||
|
|
||||||
|
#### Parameters
|
||||||
|
|
||||||
|
* **targetTableName**: `string`
|
||||||
|
The name of the target table to create.
|
||||||
|
|
||||||
|
* **sourceUri**: `string`
|
||||||
|
The URI of the source table to clone from.
|
||||||
|
|
||||||
|
* **options?**
|
||||||
|
Clone options.
|
||||||
|
|
||||||
|
* **options.isShallow?**: `boolean`
|
||||||
|
Whether to perform a shallow clone (defaults to true).
|
||||||
|
|
||||||
|
* **options.sourceTag?**: `string`
|
||||||
|
The tag of the source table to clone.
|
||||||
|
|
||||||
|
* **options.sourceVersion?**: `number`
|
||||||
|
The version of the source table to clone.
|
||||||
|
|
||||||
|
* **options.targetNamespace?**: `string`[]
|
||||||
|
The namespace for the target table (defaults to root namespace).
|
||||||
|
|
||||||
|
#### Returns
|
||||||
|
|
||||||
|
`Promise`<[`Table`](Table.md)>
|
||||||
|
|
||||||
|
***
|
||||||
|
|
||||||
### close()
|
### close()
|
||||||
|
|
||||||
```ts
|
```ts
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ function makeArrowTable(
|
|||||||
metadata?): ArrowTable
|
metadata?): ArrowTable
|
||||||
```
|
```
|
||||||
|
|
||||||
An enhanced version of the makeTable function from Apache Arrow
|
An enhanced version of the apache-arrow makeTable function from Apache Arrow
|
||||||
that supports nested fields and embeddings columns.
|
that supports nested fields and embeddings columns.
|
||||||
|
|
||||||
(typically you do not need to call this function. It will be called automatically
|
(typically you do not need to call this function. It will be called automatically
|
||||||
|
|||||||
@@ -78,6 +78,7 @@
|
|||||||
- [TableNamesOptions](interfaces/TableNamesOptions.md)
|
- [TableNamesOptions](interfaces/TableNamesOptions.md)
|
||||||
- [TableStatistics](interfaces/TableStatistics.md)
|
- [TableStatistics](interfaces/TableStatistics.md)
|
||||||
- [TimeoutConfig](interfaces/TimeoutConfig.md)
|
- [TimeoutConfig](interfaces/TimeoutConfig.md)
|
||||||
|
- [TlsConfig](interfaces/TlsConfig.md)
|
||||||
- [TokenResponse](interfaces/TokenResponse.md)
|
- [TokenResponse](interfaces/TokenResponse.md)
|
||||||
- [UpdateOptions](interfaces/UpdateOptions.md)
|
- [UpdateOptions](interfaces/UpdateOptions.md)
|
||||||
- [UpdateResult](interfaces/UpdateResult.md)
|
- [UpdateResult](interfaces/UpdateResult.md)
|
||||||
|
|||||||
@@ -40,6 +40,14 @@ optional timeoutConfig: TimeoutConfig;
|
|||||||
|
|
||||||
***
|
***
|
||||||
|
|
||||||
|
### tlsConfig?
|
||||||
|
|
||||||
|
```ts
|
||||||
|
optional tlsConfig: TlsConfig;
|
||||||
|
```
|
||||||
|
|
||||||
|
***
|
||||||
|
|
||||||
### userAgent?
|
### userAgent?
|
||||||
|
|
||||||
```ts
|
```ts
|
||||||
|
|||||||
49
docs/src/js/interfaces/TlsConfig.md
Normal file
49
docs/src/js/interfaces/TlsConfig.md
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||||
|
|
||||||
|
***
|
||||||
|
|
||||||
|
[@lancedb/lancedb](../globals.md) / TlsConfig
|
||||||
|
|
||||||
|
# Interface: TlsConfig
|
||||||
|
|
||||||
|
TLS/mTLS configuration for the remote HTTP client.
|
||||||
|
|
||||||
|
## Properties
|
||||||
|
|
||||||
|
### assertHostname?
|
||||||
|
|
||||||
|
```ts
|
||||||
|
optional assertHostname: boolean;
|
||||||
|
```
|
||||||
|
|
||||||
|
Whether to verify the hostname in the server's certificate.
|
||||||
|
|
||||||
|
***
|
||||||
|
|
||||||
|
### certFile?
|
||||||
|
|
||||||
|
```ts
|
||||||
|
optional certFile: string;
|
||||||
|
```
|
||||||
|
|
||||||
|
Path to the client certificate file (PEM format) for mTLS authentication.
|
||||||
|
|
||||||
|
***
|
||||||
|
|
||||||
|
### keyFile?
|
||||||
|
|
||||||
|
```ts
|
||||||
|
optional keyFile: string;
|
||||||
|
```
|
||||||
|
|
||||||
|
Path to the client private key file (PEM format) for mTLS authentication.
|
||||||
|
|
||||||
|
***
|
||||||
|
|
||||||
|
### sslCaCert?
|
||||||
|
|
||||||
|
```ts
|
||||||
|
optional sslCaCert: string;
|
||||||
|
```
|
||||||
|
|
||||||
|
Path to the CA certificate file (PEM format) for server verification.
|
||||||
@@ -8,7 +8,7 @@
|
|||||||
<parent>
|
<parent>
|
||||||
<groupId>com.lancedb</groupId>
|
<groupId>com.lancedb</groupId>
|
||||||
<artifactId>lancedb-parent</artifactId>
|
<artifactId>lancedb-parent</artifactId>
|
||||||
<version>0.22.1-beta.2</version>
|
<version>0.22.1-final.0</version>
|
||||||
<relativePath>../pom.xml</relativePath>
|
<relativePath>../pom.xml</relativePath>
|
||||||
</parent>
|
</parent>
|
||||||
|
|
||||||
|
|||||||
@@ -8,7 +8,7 @@
|
|||||||
<parent>
|
<parent>
|
||||||
<groupId>com.lancedb</groupId>
|
<groupId>com.lancedb</groupId>
|
||||||
<artifactId>lancedb-parent</artifactId>
|
<artifactId>lancedb-parent</artifactId>
|
||||||
<version>0.22.1-beta.2</version>
|
<version>0.22.1-final.0</version>
|
||||||
<relativePath>../pom.xml</relativePath>
|
<relativePath>../pom.xml</relativePath>
|
||||||
</parent>
|
</parent>
|
||||||
|
|
||||||
|
|||||||
@@ -6,7 +6,7 @@
|
|||||||
|
|
||||||
<groupId>com.lancedb</groupId>
|
<groupId>com.lancedb</groupId>
|
||||||
<artifactId>lancedb-parent</artifactId>
|
<artifactId>lancedb-parent</artifactId>
|
||||||
<version>0.22.1-beta.2</version>
|
<version>0.22.1-final.0</version>
|
||||||
<packaging>pom</packaging>
|
<packaging>pom</packaging>
|
||||||
<name>${project.artifactId}</name>
|
<name>${project.artifactId}</name>
|
||||||
<description>LanceDB Java SDK Parent POM</description>
|
<description>LanceDB Java SDK Parent POM</description>
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "lancedb-nodejs"
|
name = "lancedb-nodejs"
|
||||||
edition.workspace = true
|
edition.workspace = true
|
||||||
version = "0.22.1-beta.2"
|
version = "0.22.1"
|
||||||
license.workspace = true
|
license.workspace = true
|
||||||
description.workspace = true
|
description.workspace = true
|
||||||
repository.workspace = true
|
repository.workspace = true
|
||||||
|
|||||||
@@ -1,17 +1,5 @@
|
|||||||
// SPDX-License-Identifier: Apache-2.0
|
// SPDX-License-Identifier: Apache-2.0
|
||||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||||
|
|
||||||
import {
|
|
||||||
Bool,
|
|
||||||
Field,
|
|
||||||
Int32,
|
|
||||||
List,
|
|
||||||
Schema,
|
|
||||||
Struct,
|
|
||||||
Uint8,
|
|
||||||
Utf8,
|
|
||||||
} from "apache-arrow";
|
|
||||||
|
|
||||||
import * as arrow15 from "apache-arrow-15";
|
import * as arrow15 from "apache-arrow-15";
|
||||||
import * as arrow16 from "apache-arrow-16";
|
import * as arrow16 from "apache-arrow-16";
|
||||||
import * as arrow17 from "apache-arrow-17";
|
import * as arrow17 from "apache-arrow-17";
|
||||||
@@ -25,11 +13,9 @@ import {
|
|||||||
fromTableToBuffer,
|
fromTableToBuffer,
|
||||||
makeArrowTable,
|
makeArrowTable,
|
||||||
makeEmptyTable,
|
makeEmptyTable,
|
||||||
tableFromIPC,
|
|
||||||
} from "../lancedb/arrow";
|
} from "../lancedb/arrow";
|
||||||
import {
|
import {
|
||||||
EmbeddingFunction,
|
EmbeddingFunction,
|
||||||
FieldOptions,
|
|
||||||
FunctionOptions,
|
FunctionOptions,
|
||||||
} from "../lancedb/embedding/embedding_function";
|
} from "../lancedb/embedding/embedding_function";
|
||||||
import { EmbeddingFunctionConfig } from "../lancedb/embedding/registry";
|
import { EmbeddingFunctionConfig } from "../lancedb/embedding/registry";
|
||||||
@@ -1008,5 +994,64 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
|||||||
expect(result).toEqual(null);
|
expect(result).toEqual(null);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
describe("boolean null handling", function () {
|
||||||
|
it("should handle null values in nullable boolean fields", () => {
|
||||||
|
const { makeArrowTable } = require("../lancedb/arrow");
|
||||||
|
const schema = new Schema([new Field("test", new arrow.Bool(), true)]);
|
||||||
|
|
||||||
|
// Test with all null values
|
||||||
|
const data = [{ test: null }];
|
||||||
|
const table = makeArrowTable(data, { schema });
|
||||||
|
|
||||||
|
expect(table.numRows).toBe(1);
|
||||||
|
expect(table.schema.names).toEqual(["test"]);
|
||||||
|
expect(table.getChild("test")!.get(0)).toBeNull();
|
||||||
|
});
|
||||||
|
|
||||||
|
it("should handle mixed null and non-null boolean values", () => {
|
||||||
|
const { makeArrowTable } = require("../lancedb/arrow");
|
||||||
|
const schema = new Schema([new Field("test", new Bool(), true)]);
|
||||||
|
|
||||||
|
// Test with mixed values
|
||||||
|
const data = [{ test: true }, { test: null }, { test: false }];
|
||||||
|
const table = makeArrowTable(data, { schema });
|
||||||
|
|
||||||
|
expect(table.numRows).toBe(3);
|
||||||
|
expect(table.getChild("test")!.get(0)).toBe(true);
|
||||||
|
expect(table.getChild("test")!.get(1)).toBeNull();
|
||||||
|
expect(table.getChild("test")!.get(2)).toBe(false);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// Test for the undefined values bug fix
|
||||||
|
describe("undefined values handling", () => {
|
||||||
|
it("should handle mixed undefined and actual values", () => {
|
||||||
|
const schema = new Schema([
|
||||||
|
new Field("text", new Utf8(), true), // nullable
|
||||||
|
new Field("number", new Int32(), true), // nullable
|
||||||
|
new Field("bool", new Bool(), true), // nullable
|
||||||
|
]);
|
||||||
|
|
||||||
|
const data = [
|
||||||
|
{ text: undefined, number: 42, bool: true },
|
||||||
|
{ text: "hello", number: undefined, bool: false },
|
||||||
|
{ text: "world", number: 123, bool: undefined },
|
||||||
|
];
|
||||||
|
const table = makeArrowTable(data, { schema });
|
||||||
|
|
||||||
|
const result = table.toArray();
|
||||||
|
expect(result).toHaveLength(3);
|
||||||
|
expect(result[0].text).toBe(null);
|
||||||
|
expect(result[0].number).toBe(42);
|
||||||
|
expect(result[0].bool).toBe(true);
|
||||||
|
expect(result[1].text).toBe("hello");
|
||||||
|
expect(result[1].number).toBe(null);
|
||||||
|
expect(result[1].bool).toBe(false);
|
||||||
|
expect(result[2].text).toBe("world");
|
||||||
|
expect(result[2].number).toBe(123);
|
||||||
|
expect(result[2].bool).toBe(null);
|
||||||
|
});
|
||||||
|
});
|
||||||
},
|
},
|
||||||
);
|
);
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ import {
|
|||||||
ClientConfig,
|
ClientConfig,
|
||||||
Connection,
|
Connection,
|
||||||
ConnectionOptions,
|
ConnectionOptions,
|
||||||
NativeJsHeaderProvider,
|
|
||||||
TlsConfig,
|
TlsConfig,
|
||||||
connect,
|
connect,
|
||||||
} from "../lancedb";
|
} from "../lancedb";
|
||||||
|
|||||||
@@ -39,7 +39,6 @@ import {
|
|||||||
Operator,
|
Operator,
|
||||||
instanceOfFullTextQuery,
|
instanceOfFullTextQuery,
|
||||||
} from "../lancedb/query";
|
} from "../lancedb/query";
|
||||||
import exp = require("constants");
|
|
||||||
|
|
||||||
describe.each([arrow15, arrow16, arrow17, arrow18])(
|
describe.each([arrow15, arrow16, arrow17, arrow18])(
|
||||||
"Given a table",
|
"Given a table",
|
||||||
@@ -488,6 +487,32 @@ describe("merge insert", () => {
|
|||||||
.execute(newData, { timeoutMs: 0 }),
|
.execute(newData, { timeoutMs: 0 }),
|
||||||
).rejects.toThrow("merge insert timed out");
|
).rejects.toThrow("merge insert timed out");
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test("useIndex", async () => {
|
||||||
|
const newData = [
|
||||||
|
{ a: 2, b: "x" },
|
||||||
|
{ a: 4, b: "z" },
|
||||||
|
];
|
||||||
|
|
||||||
|
// Test with useIndex(true) - should work fine
|
||||||
|
const result1 = await table
|
||||||
|
.mergeInsert("a")
|
||||||
|
.whenNotMatchedInsertAll()
|
||||||
|
.useIndex(true)
|
||||||
|
.execute(newData);
|
||||||
|
|
||||||
|
expect(result1.numInsertedRows).toBe(1); // Only a=4 should be inserted
|
||||||
|
|
||||||
|
// Test with useIndex(false) - should also work fine
|
||||||
|
const newData2 = [{ a: 5, b: "w" }];
|
||||||
|
const result2 = await table
|
||||||
|
.mergeInsert("a")
|
||||||
|
.whenNotMatchedInsertAll()
|
||||||
|
.useIndex(false)
|
||||||
|
.execute(newData2);
|
||||||
|
|
||||||
|
expect(result2.numInsertedRows).toBe(1); // a=5 should be inserted
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
describe("When creating an index", () => {
|
describe("When creating an index", () => {
|
||||||
|
|||||||
@@ -48,6 +48,7 @@
|
|||||||
"noUnreachableSuper": "error",
|
"noUnreachableSuper": "error",
|
||||||
"noUnsafeFinally": "error",
|
"noUnsafeFinally": "error",
|
||||||
"noUnsafeOptionalChaining": "error",
|
"noUnsafeOptionalChaining": "error",
|
||||||
|
"noUnusedImports": "error",
|
||||||
"noUnusedLabels": "error",
|
"noUnusedLabels": "error",
|
||||||
"noUnusedVariables": "warn",
|
"noUnusedVariables": "warn",
|
||||||
"useIsNan": "error",
|
"useIsNan": "error",
|
||||||
|
|||||||
@@ -41,7 +41,6 @@ import {
|
|||||||
vectorFromArray as badVectorFromArray,
|
vectorFromArray as badVectorFromArray,
|
||||||
makeBuilder,
|
makeBuilder,
|
||||||
makeData,
|
makeData,
|
||||||
makeTable,
|
|
||||||
} from "apache-arrow";
|
} from "apache-arrow";
|
||||||
import { Buffers } from "apache-arrow/data";
|
import { Buffers } from "apache-arrow/data";
|
||||||
import { type EmbeddingFunction } from "./embedding/embedding_function";
|
import { type EmbeddingFunction } from "./embedding/embedding_function";
|
||||||
@@ -279,7 +278,7 @@ export class MakeArrowTableOptions {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* An enhanced version of the {@link makeTable} function from Apache Arrow
|
* An enhanced version of the apache-arrow makeTable function from Apache Arrow
|
||||||
* that supports nested fields and embeddings columns.
|
* that supports nested fields and embeddings columns.
|
||||||
*
|
*
|
||||||
* (typically you do not need to call this function. It will be called automatically
|
* (typically you do not need to call this function. It will be called automatically
|
||||||
@@ -705,7 +704,7 @@ function transposeData(
|
|||||||
}
|
}
|
||||||
return current;
|
return current;
|
||||||
});
|
});
|
||||||
return makeVector(values, field.type);
|
return makeVector(values, field.type, undefined, field.nullable);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -752,9 +751,30 @@ function makeVector(
|
|||||||
values: unknown[],
|
values: unknown[],
|
||||||
type?: DataType,
|
type?: DataType,
|
||||||
stringAsDictionary?: boolean,
|
stringAsDictionary?: boolean,
|
||||||
|
nullable?: boolean,
|
||||||
// biome-ignore lint/suspicious/noExplicitAny: skip
|
// biome-ignore lint/suspicious/noExplicitAny: skip
|
||||||
): Vector<any> {
|
): Vector<any> {
|
||||||
if (type !== undefined) {
|
if (type !== undefined) {
|
||||||
|
// Convert undefined values to null for nullable fields
|
||||||
|
if (nullable) {
|
||||||
|
values = values.map((v) => (v === undefined ? null : v));
|
||||||
|
}
|
||||||
|
|
||||||
|
// workaround for: https://github.com/apache/arrow-js/issues/68
|
||||||
|
if (DataType.isBool(type)) {
|
||||||
|
const hasNonNullValue = values.some((v) => v !== null && v !== undefined);
|
||||||
|
if (!hasNonNullValue) {
|
||||||
|
const nullBitmap = new Uint8Array(Math.ceil(values.length / 8));
|
||||||
|
const data = makeData({
|
||||||
|
type: type,
|
||||||
|
length: values.length,
|
||||||
|
nullCount: values.length,
|
||||||
|
nullBitmap,
|
||||||
|
});
|
||||||
|
return arrowMakeVector(data);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// No need for inference, let Arrow create it
|
// No need for inference, let Arrow create it
|
||||||
if (type instanceof Int) {
|
if (type instanceof Int) {
|
||||||
if (DataType.isInt(type) && type.bitWidth === 64) {
|
if (DataType.isInt(type) && type.bitWidth === 64) {
|
||||||
@@ -879,7 +899,12 @@ async function applyEmbeddingsFromMetadata(
|
|||||||
for (const field of schema.fields) {
|
for (const field of schema.fields) {
|
||||||
if (!(field.name in columns)) {
|
if (!(field.name in columns)) {
|
||||||
const nullValues = new Array(table.numRows).fill(null);
|
const nullValues = new Array(table.numRows).fill(null);
|
||||||
columns[field.name] = makeVector(nullValues, field.type);
|
columns[field.name] = makeVector(
|
||||||
|
nullValues,
|
||||||
|
field.type,
|
||||||
|
undefined,
|
||||||
|
field.nullable,
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -943,7 +968,12 @@ async function applyEmbeddings<T>(
|
|||||||
} else if (schema != null) {
|
} else if (schema != null) {
|
||||||
const destField = schema.fields.find((f) => f.name === destColumn);
|
const destField = schema.fields.find((f) => f.name === destColumn);
|
||||||
if (destField != null) {
|
if (destField != null) {
|
||||||
newColumns[destColumn] = makeVector([], destField.type);
|
newColumns[destColumn] = makeVector(
|
||||||
|
[],
|
||||||
|
destField.type,
|
||||||
|
undefined,
|
||||||
|
destField.nullable,
|
||||||
|
);
|
||||||
} else {
|
} else {
|
||||||
throw new Error(
|
throw new Error(
|
||||||
`Attempt to apply embeddings to an empty table failed because schema was missing embedding column '${destColumn}'`,
|
`Attempt to apply embeddings to an empty table failed because schema was missing embedding column '${destColumn}'`,
|
||||||
|
|||||||
@@ -3,7 +3,6 @@
|
|||||||
|
|
||||||
import {
|
import {
|
||||||
Data,
|
Data,
|
||||||
Schema,
|
|
||||||
SchemaLike,
|
SchemaLike,
|
||||||
TableLike,
|
TableLike,
|
||||||
fromTableToStreamBuffer,
|
fromTableToStreamBuffer,
|
||||||
|
|||||||
@@ -70,6 +70,23 @@ export class MergeInsertBuilder {
|
|||||||
this.#schema,
|
this.#schema,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Controls whether to use indexes for the merge operation.
|
||||||
|
*
|
||||||
|
* When set to `true` (the default), the operation will use an index if available
|
||||||
|
* on the join key for improved performance. When set to `false`, it forces a full
|
||||||
|
* table scan even if an index exists. This can be useful for benchmarking or when
|
||||||
|
* the query optimizer chooses a suboptimal path.
|
||||||
|
*
|
||||||
|
* @param useIndex - Whether to use indices for the merge operation. Defaults to `true`.
|
||||||
|
*/
|
||||||
|
useIndex(useIndex: boolean): MergeInsertBuilder {
|
||||||
|
return new MergeInsertBuilder(
|
||||||
|
this.#native.useIndex(useIndex),
|
||||||
|
this.#schema,
|
||||||
|
);
|
||||||
|
}
|
||||||
/**
|
/**
|
||||||
* Executes the merge insert operation
|
* Executes the merge insert operation
|
||||||
*
|
*
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-darwin-arm64",
|
"name": "@lancedb/lancedb-darwin-arm64",
|
||||||
"version": "0.22.1-beta.2",
|
"version": "0.22.1",
|
||||||
"os": ["darwin"],
|
"os": ["darwin"],
|
||||||
"cpu": ["arm64"],
|
"cpu": ["arm64"],
|
||||||
"main": "lancedb.darwin-arm64.node",
|
"main": "lancedb.darwin-arm64.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-darwin-x64",
|
"name": "@lancedb/lancedb-darwin-x64",
|
||||||
"version": "0.22.1-beta.2",
|
"version": "0.22.1",
|
||||||
"os": ["darwin"],
|
"os": ["darwin"],
|
||||||
"cpu": ["x64"],
|
"cpu": ["x64"],
|
||||||
"main": "lancedb.darwin-x64.node",
|
"main": "lancedb.darwin-x64.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-linux-arm64-gnu",
|
"name": "@lancedb/lancedb-linux-arm64-gnu",
|
||||||
"version": "0.22.1-beta.2",
|
"version": "0.22.1",
|
||||||
"os": ["linux"],
|
"os": ["linux"],
|
||||||
"cpu": ["arm64"],
|
"cpu": ["arm64"],
|
||||||
"main": "lancedb.linux-arm64-gnu.node",
|
"main": "lancedb.linux-arm64-gnu.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-linux-arm64-musl",
|
"name": "@lancedb/lancedb-linux-arm64-musl",
|
||||||
"version": "0.22.1-beta.2",
|
"version": "0.22.1",
|
||||||
"os": ["linux"],
|
"os": ["linux"],
|
||||||
"cpu": ["arm64"],
|
"cpu": ["arm64"],
|
||||||
"main": "lancedb.linux-arm64-musl.node",
|
"main": "lancedb.linux-arm64-musl.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-linux-x64-gnu",
|
"name": "@lancedb/lancedb-linux-x64-gnu",
|
||||||
"version": "0.22.1-beta.2",
|
"version": "0.22.1",
|
||||||
"os": ["linux"],
|
"os": ["linux"],
|
||||||
"cpu": ["x64"],
|
"cpu": ["x64"],
|
||||||
"main": "lancedb.linux-x64-gnu.node",
|
"main": "lancedb.linux-x64-gnu.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-linux-x64-musl",
|
"name": "@lancedb/lancedb-linux-x64-musl",
|
||||||
"version": "0.22.1-beta.2",
|
"version": "0.22.1",
|
||||||
"os": ["linux"],
|
"os": ["linux"],
|
||||||
"cpu": ["x64"],
|
"cpu": ["x64"],
|
||||||
"main": "lancedb.linux-x64-musl.node",
|
"main": "lancedb.linux-x64-musl.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-win32-arm64-msvc",
|
"name": "@lancedb/lancedb-win32-arm64-msvc",
|
||||||
"version": "0.22.1-beta.2",
|
"version": "0.22.1",
|
||||||
"os": [
|
"os": [
|
||||||
"win32"
|
"win32"
|
||||||
],
|
],
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-win32-x64-msvc",
|
"name": "@lancedb/lancedb-win32-x64-msvc",
|
||||||
"version": "0.22.1-beta.2",
|
"version": "0.22.1",
|
||||||
"os": ["win32"],
|
"os": ["win32"],
|
||||||
"cpu": ["x64"],
|
"cpu": ["x64"],
|
||||||
"main": "lancedb.win32-x64-msvc.node",
|
"main": "lancedb.win32-x64-msvc.node",
|
||||||
|
|||||||
4
nodejs/package-lock.json
generated
4
nodejs/package-lock.json
generated
@@ -1,12 +1,12 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb",
|
"name": "@lancedb/lancedb",
|
||||||
"version": "0.22.1-beta.2",
|
"version": "0.22.1",
|
||||||
"lockfileVersion": 3,
|
"lockfileVersion": 3,
|
||||||
"requires": true,
|
"requires": true,
|
||||||
"packages": {
|
"packages": {
|
||||||
"": {
|
"": {
|
||||||
"name": "@lancedb/lancedb",
|
"name": "@lancedb/lancedb",
|
||||||
"version": "0.22.1-beta.2",
|
"version": "0.22.1",
|
||||||
"cpu": [
|
"cpu": [
|
||||||
"x64",
|
"x64",
|
||||||
"arm64"
|
"arm64"
|
||||||
|
|||||||
@@ -11,7 +11,7 @@
|
|||||||
"ann"
|
"ann"
|
||||||
],
|
],
|
||||||
"private": false,
|
"private": false,
|
||||||
"version": "0.22.1-beta.2",
|
"version": "0.22.1",
|
||||||
"main": "dist/index.js",
|
"main": "dist/index.js",
|
||||||
"exports": {
|
"exports": {
|
||||||
".": "./dist/index.js",
|
".": "./dist/index.js",
|
||||||
|
|||||||
@@ -43,6 +43,13 @@ impl NativeMergeInsertBuilder {
|
|||||||
self.inner.timeout(Duration::from_millis(timeout as u64));
|
self.inner.timeout(Duration::from_millis(timeout as u64));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[napi]
|
||||||
|
pub fn use_index(&self, use_index: bool) -> Self {
|
||||||
|
let mut this = self.clone();
|
||||||
|
this.inner.use_index(use_index);
|
||||||
|
this
|
||||||
|
}
|
||||||
|
|
||||||
#[napi(catch_unwind)]
|
#[napi(catch_unwind)]
|
||||||
pub async fn execute(&self, buf: Buffer) -> napi::Result<MergeResult> {
|
pub async fn execute(&self, buf: Buffer) -> napi::Result<MergeResult> {
|
||||||
let data = ipc_file_to_batches(buf.to_vec())
|
let data = ipc_file_to_batches(buf.to_vec())
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
[tool.bumpversion]
|
[tool.bumpversion]
|
||||||
current_version = "0.25.1-beta.3"
|
current_version = "0.25.2-beta.0"
|
||||||
parse = """(?x)
|
parse = """(?x)
|
||||||
(?P<major>0|[1-9]\\d*)\\.
|
(?P<major>0|[1-9]\\d*)\\.
|
||||||
(?P<minor>0|[1-9]\\d*)\\.
|
(?P<minor>0|[1-9]\\d*)\\.
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "lancedb-python"
|
name = "lancedb-python"
|
||||||
version = "0.25.1-beta.3"
|
version = "0.25.2-beta.0"
|
||||||
edition.workspace = true
|
edition.workspace = true
|
||||||
description = "Python bindings for LanceDB"
|
description = "Python bindings for LanceDB"
|
||||||
license.workspace = true
|
license.workspace = true
|
||||||
|
|||||||
@@ -33,6 +33,7 @@ class LanceMergeInsertBuilder(object):
|
|||||||
self._when_not_matched_by_source_delete = False
|
self._when_not_matched_by_source_delete = False
|
||||||
self._when_not_matched_by_source_condition = None
|
self._when_not_matched_by_source_condition = None
|
||||||
self._timeout = None
|
self._timeout = None
|
||||||
|
self._use_index = True
|
||||||
|
|
||||||
def when_matched_update_all(
|
def when_matched_update_all(
|
||||||
self, *, where: Optional[str] = None
|
self, *, where: Optional[str] = None
|
||||||
@@ -78,6 +79,23 @@ class LanceMergeInsertBuilder(object):
|
|||||||
self._when_not_matched_by_source_condition = condition
|
self._when_not_matched_by_source_condition = condition
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
def use_index(self, use_index: bool) -> LanceMergeInsertBuilder:
|
||||||
|
"""
|
||||||
|
Controls whether to use indexes for the merge operation.
|
||||||
|
|
||||||
|
When set to `True` (the default), the operation will use an index if available
|
||||||
|
on the join key for improved performance. When set to `False`, it forces a full
|
||||||
|
table scan even if an index exists. This can be useful for benchmarking or when
|
||||||
|
the query optimizer chooses a suboptimal path.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
use_index: bool
|
||||||
|
Whether to use indices for the merge operation. Defaults to `True`.
|
||||||
|
"""
|
||||||
|
self._use_index = use_index
|
||||||
|
return self
|
||||||
|
|
||||||
def execute(
|
def execute(
|
||||||
self,
|
self,
|
||||||
new_data: DATA,
|
new_data: DATA,
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ from .linear_combination import LinearCombinationReranker
|
|||||||
from .openai import OpenaiReranker
|
from .openai import OpenaiReranker
|
||||||
from .jinaai import JinaReranker
|
from .jinaai import JinaReranker
|
||||||
from .rrf import RRFReranker
|
from .rrf import RRFReranker
|
||||||
|
from .mrr import MRRReranker
|
||||||
from .answerdotai import AnswerdotaiRerankers
|
from .answerdotai import AnswerdotaiRerankers
|
||||||
from .voyageai import VoyageAIReranker
|
from .voyageai import VoyageAIReranker
|
||||||
|
|
||||||
@@ -23,4 +24,5 @@ __all__ = [
|
|||||||
"RRFReranker",
|
"RRFReranker",
|
||||||
"AnswerdotaiRerankers",
|
"AnswerdotaiRerankers",
|
||||||
"VoyageAIReranker",
|
"VoyageAIReranker",
|
||||||
|
"MRRReranker",
|
||||||
]
|
]
|
||||||
|
|||||||
169
python/python/lancedb/rerankers/mrr.py
Normal file
169
python/python/lancedb/rerankers/mrr.py
Normal file
@@ -0,0 +1,169 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||||
|
|
||||||
|
|
||||||
|
from typing import Union, List, TYPE_CHECKING
|
||||||
|
import pyarrow as pa
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from collections import defaultdict
|
||||||
|
from .base import Reranker
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from ..table import LanceVectorQueryBuilder
|
||||||
|
|
||||||
|
|
||||||
|
class MRRReranker(Reranker):
|
||||||
|
"""
|
||||||
|
Reranks the results using Mean Reciprocal Rank (MRR) algorithm based
|
||||||
|
on the scores of vector and FTS search.
|
||||||
|
Algorithm reference - https://en.wikipedia.org/wiki/Mean_reciprocal_rank
|
||||||
|
|
||||||
|
MRR calculates the average of reciprocal ranks across different search results.
|
||||||
|
For each document, it computes the reciprocal of its rank in each system,
|
||||||
|
then takes the mean of these reciprocal ranks as the final score.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
weight_vector : float, default 0.5
|
||||||
|
Weight for vector search results (0.0 to 1.0)
|
||||||
|
weight_fts : float, default 0.5
|
||||||
|
Weight for FTS search results (0.0 to 1.0)
|
||||||
|
Note: weight_vector + weight_fts should equal 1.0
|
||||||
|
return_score : str, default "relevance"
|
||||||
|
Options are "relevance" or "all"
|
||||||
|
The type of score to return. If "relevance", will return only the relevance
|
||||||
|
score. If "all", will return all scores from the vector and FTS search along
|
||||||
|
with the relevance score.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
weight_vector: float = 0.5,
|
||||||
|
weight_fts: float = 0.5,
|
||||||
|
return_score="relevance",
|
||||||
|
):
|
||||||
|
if not (0.0 <= weight_vector <= 1.0):
|
||||||
|
raise ValueError("weight_vector must be between 0.0 and 1.0")
|
||||||
|
if not (0.0 <= weight_fts <= 1.0):
|
||||||
|
raise ValueError("weight_fts must be between 0.0 and 1.0")
|
||||||
|
if abs(weight_vector + weight_fts - 1.0) > 1e-6:
|
||||||
|
raise ValueError("weight_vector + weight_fts must equal 1.0")
|
||||||
|
|
||||||
|
super().__init__(return_score)
|
||||||
|
self.weight_vector = weight_vector
|
||||||
|
self.weight_fts = weight_fts
|
||||||
|
|
||||||
|
def rerank_hybrid(
|
||||||
|
self,
|
||||||
|
query: str, # noqa: F821
|
||||||
|
vector_results: pa.Table,
|
||||||
|
fts_results: pa.Table,
|
||||||
|
):
|
||||||
|
vector_ids = vector_results["_rowid"].to_pylist() if vector_results else []
|
||||||
|
fts_ids = fts_results["_rowid"].to_pylist() if fts_results else []
|
||||||
|
|
||||||
|
# Maps result_id to list of (type, reciprocal_rank)
|
||||||
|
mrr_score_map = defaultdict(list)
|
||||||
|
|
||||||
|
if vector_ids:
|
||||||
|
for rank, result_id in enumerate(vector_ids, 1):
|
||||||
|
reciprocal_rank = 1.0 / rank
|
||||||
|
mrr_score_map[result_id].append(("vector", reciprocal_rank))
|
||||||
|
|
||||||
|
if fts_ids:
|
||||||
|
for rank, result_id in enumerate(fts_ids, 1):
|
||||||
|
reciprocal_rank = 1.0 / rank
|
||||||
|
mrr_score_map[result_id].append(("fts", reciprocal_rank))
|
||||||
|
|
||||||
|
final_mrr_scores = {}
|
||||||
|
for result_id, scores in mrr_score_map.items():
|
||||||
|
vector_rr = 0.0
|
||||||
|
fts_rr = 0.0
|
||||||
|
|
||||||
|
for score_type, reciprocal_rank in scores:
|
||||||
|
if score_type == "vector":
|
||||||
|
vector_rr = reciprocal_rank
|
||||||
|
elif score_type == "fts":
|
||||||
|
fts_rr = reciprocal_rank
|
||||||
|
|
||||||
|
# If a document doesn't appear, its reciprocal rank is 0
|
||||||
|
weighted_mrr = self.weight_vector * vector_rr + self.weight_fts * fts_rr
|
||||||
|
final_mrr_scores[result_id] = weighted_mrr
|
||||||
|
|
||||||
|
combined_results = self.merge_results(vector_results, fts_results)
|
||||||
|
combined_row_ids = combined_results["_rowid"].to_pylist()
|
||||||
|
relevance_scores = [final_mrr_scores[row_id] for row_id in combined_row_ids]
|
||||||
|
combined_results = combined_results.append_column(
|
||||||
|
"_relevance_score", pa.array(relevance_scores, type=pa.float32())
|
||||||
|
)
|
||||||
|
combined_results = combined_results.sort_by(
|
||||||
|
[("_relevance_score", "descending")]
|
||||||
|
)
|
||||||
|
|
||||||
|
if self.score == "relevance":
|
||||||
|
combined_results = self._keep_relevance_score(combined_results)
|
||||||
|
|
||||||
|
return combined_results
|
||||||
|
|
||||||
|
def rerank_multivector(
|
||||||
|
self,
|
||||||
|
vector_results: Union[List[pa.Table], List["LanceVectorQueryBuilder"]],
|
||||||
|
query: str = None,
|
||||||
|
deduplicate: bool = True, # noqa: F821
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Reranks the results from multiple vector searches using MRR algorithm.
|
||||||
|
Each vector search result is treated as a separate ranking system,
|
||||||
|
and MRR calculates the mean of reciprocal ranks across all systems.
|
||||||
|
This cannot reuse rerank_hybrid because MRR semantics require treating
|
||||||
|
each vector result as a separate ranking system.
|
||||||
|
"""
|
||||||
|
if not all(isinstance(v, type(vector_results[0])) for v in vector_results):
|
||||||
|
raise ValueError(
|
||||||
|
"All elements in vector_results should be of the same type"
|
||||||
|
)
|
||||||
|
|
||||||
|
# avoid circular import
|
||||||
|
if type(vector_results[0]).__name__ == "LanceVectorQueryBuilder":
|
||||||
|
vector_results = [result.to_arrow() for result in vector_results]
|
||||||
|
elif not isinstance(vector_results[0], pa.Table):
|
||||||
|
raise ValueError(
|
||||||
|
"vector_results should be a list of pa.Table or LanceVectorQueryBuilder"
|
||||||
|
)
|
||||||
|
|
||||||
|
if not all("_rowid" in result.column_names for result in vector_results):
|
||||||
|
raise ValueError(
|
||||||
|
"'_rowid' is required for deduplication. \
|
||||||
|
add _rowid to search results like this: \
|
||||||
|
`search().with_row_id(True)`"
|
||||||
|
)
|
||||||
|
|
||||||
|
mrr_score_map = defaultdict(list)
|
||||||
|
|
||||||
|
for result_table in vector_results:
|
||||||
|
result_ids = result_table["_rowid"].to_pylist()
|
||||||
|
for rank, result_id in enumerate(result_ids, 1):
|
||||||
|
reciprocal_rank = 1.0 / rank
|
||||||
|
mrr_score_map[result_id].append(reciprocal_rank)
|
||||||
|
|
||||||
|
final_mrr_scores = {}
|
||||||
|
for result_id, reciprocal_ranks in mrr_score_map.items():
|
||||||
|
mean_rr = np.mean(reciprocal_ranks)
|
||||||
|
final_mrr_scores[result_id] = mean_rr
|
||||||
|
|
||||||
|
combined = pa.concat_tables(vector_results, **self._concat_tables_args)
|
||||||
|
combined = self._deduplicate(combined)
|
||||||
|
|
||||||
|
combined_row_ids = combined["_rowid"].to_pylist()
|
||||||
|
|
||||||
|
relevance_scores = [final_mrr_scores[row_id] for row_id in combined_row_ids]
|
||||||
|
combined = combined.append_column(
|
||||||
|
"_relevance_score", pa.array(relevance_scores, type=pa.float32())
|
||||||
|
)
|
||||||
|
combined = combined.sort_by([("_relevance_score", "descending")])
|
||||||
|
|
||||||
|
if self.score == "relevance":
|
||||||
|
combined = self._keep_relevance_score(combined)
|
||||||
|
|
||||||
|
return combined
|
||||||
@@ -1470,10 +1470,7 @@ class Table(ABC):
|
|||||||
be deleted unless they are at least 7 days old. If delete_unverified is True
|
be deleted unless they are at least 7 days old. If delete_unverified is True
|
||||||
then these files will be deleted regardless of their age.
|
then these files will be deleted regardless of their age.
|
||||||
retrain: bool, default False
|
retrain: bool, default False
|
||||||
If True, retrain the vector indices, this would refine the IVF clustering
|
This parameter is no longer used and is deprecated.
|
||||||
and quantization, which may improve the search accuracy. It's faster than
|
|
||||||
re-creating the index from scratch, so it's recommended to try this first,
|
|
||||||
when the data distribution has changed significantly.
|
|
||||||
|
|
||||||
Experimental API
|
Experimental API
|
||||||
----------------
|
----------------
|
||||||
@@ -2835,10 +2832,7 @@ class LanceTable(Table):
|
|||||||
be deleted unless they are at least 7 days old. If delete_unverified is True
|
be deleted unless they are at least 7 days old. If delete_unverified is True
|
||||||
then these files will be deleted regardless of their age.
|
then these files will be deleted regardless of their age.
|
||||||
retrain: bool, default False
|
retrain: bool, default False
|
||||||
If True, retrain the vector indices, this would refine the IVF clustering
|
This parameter is no longer used and is deprecated.
|
||||||
and quantization, which may improve the search accuracy. It's faster than
|
|
||||||
re-creating the index from scratch, so it's recommended to try this first,
|
|
||||||
when the data distribution has changed significantly.
|
|
||||||
|
|
||||||
Experimental API
|
Experimental API
|
||||||
----------------
|
----------------
|
||||||
@@ -3926,6 +3920,7 @@ class AsyncTable:
|
|||||||
when_not_matched_by_source_delete=merge._when_not_matched_by_source_delete,
|
when_not_matched_by_source_delete=merge._when_not_matched_by_source_delete,
|
||||||
when_not_matched_by_source_condition=merge._when_not_matched_by_source_condition,
|
when_not_matched_by_source_condition=merge._when_not_matched_by_source_condition,
|
||||||
timeout=merge._timeout,
|
timeout=merge._timeout,
|
||||||
|
use_index=merge._use_index,
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -4298,10 +4293,7 @@ class AsyncTable:
|
|||||||
be deleted unless they are at least 7 days old. If delete_unverified is True
|
be deleted unless they are at least 7 days old. If delete_unverified is True
|
||||||
then these files will be deleted regardless of their age.
|
then these files will be deleted regardless of their age.
|
||||||
retrain: bool, default False
|
retrain: bool, default False
|
||||||
If True, retrain the vector indices, this would refine the IVF clustering
|
This parameter is no longer used and is deprecated.
|
||||||
and quantization, which may improve the search accuracy. It's faster than
|
|
||||||
re-creating the index from scratch, so it's recommended to try this first,
|
|
||||||
when the data distribution has changed significantly.
|
|
||||||
|
|
||||||
Experimental API
|
Experimental API
|
||||||
----------------
|
----------------
|
||||||
@@ -4324,10 +4316,19 @@ class AsyncTable:
|
|||||||
cleanup_since_ms: Optional[int] = None
|
cleanup_since_ms: Optional[int] = None
|
||||||
if cleanup_older_than is not None:
|
if cleanup_older_than is not None:
|
||||||
cleanup_since_ms = round(cleanup_older_than.total_seconds() * 1000)
|
cleanup_since_ms = round(cleanup_older_than.total_seconds() * 1000)
|
||||||
|
|
||||||
|
if retrain:
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
warnings.warn(
|
||||||
|
"The 'retrain' parameter is deprecated and will be removed in a "
|
||||||
|
"future version.",
|
||||||
|
DeprecationWarning,
|
||||||
|
)
|
||||||
|
|
||||||
return await self._inner.optimize(
|
return await self._inner.optimize(
|
||||||
cleanup_since_ms=cleanup_since_ms,
|
cleanup_since_ms=cleanup_since_ms,
|
||||||
delete_unverified=delete_unverified,
|
delete_unverified=delete_unverified,
|
||||||
retrain=retrain,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
async def list_indices(self) -> Iterable[IndexConfig]:
|
async def list_indices(self) -> Iterable[IndexConfig]:
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ from lancedb.rerankers import (
|
|||||||
JinaReranker,
|
JinaReranker,
|
||||||
AnswerdotaiRerankers,
|
AnswerdotaiRerankers,
|
||||||
VoyageAIReranker,
|
VoyageAIReranker,
|
||||||
|
MRRReranker,
|
||||||
)
|
)
|
||||||
from lancedb.table import LanceTable
|
from lancedb.table import LanceTable
|
||||||
|
|
||||||
@@ -46,6 +47,7 @@ def get_test_table(tmp_path, use_tantivy):
|
|||||||
db,
|
db,
|
||||||
"my_table",
|
"my_table",
|
||||||
schema=MyTable,
|
schema=MyTable,
|
||||||
|
mode="overwrite",
|
||||||
)
|
)
|
||||||
|
|
||||||
# Need to test with a bunch of phrases to make sure sorting is consistent
|
# Need to test with a bunch of phrases to make sure sorting is consistent
|
||||||
@@ -96,7 +98,7 @@ def get_test_table(tmp_path, use_tantivy):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Create a fts index
|
# Create a fts index
|
||||||
table.create_fts_index("text", use_tantivy=use_tantivy)
|
table.create_fts_index("text", use_tantivy=use_tantivy, replace=True)
|
||||||
|
|
||||||
return table, MyTable
|
return table, MyTable
|
||||||
|
|
||||||
@@ -320,6 +322,34 @@ def test_rrf_reranker(tmp_path, use_tantivy):
|
|||||||
_run_test_hybrid_reranker(reranker, tmp_path, use_tantivy)
|
_run_test_hybrid_reranker(reranker, tmp_path, use_tantivy)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||||
|
def test_mrr_reranker(tmp_path, use_tantivy):
|
||||||
|
reranker = MRRReranker()
|
||||||
|
_run_test_hybrid_reranker(reranker, tmp_path, use_tantivy)
|
||||||
|
|
||||||
|
# Test multi-vector part
|
||||||
|
table, schema = get_test_table(tmp_path, use_tantivy)
|
||||||
|
query = "single player experience"
|
||||||
|
rs1 = table.search(query, vector_column_name="vector").limit(10).with_row_id(True)
|
||||||
|
rs2 = (
|
||||||
|
table.search(query, vector_column_name="meta_vector")
|
||||||
|
.limit(10)
|
||||||
|
.with_row_id(True)
|
||||||
|
)
|
||||||
|
result = reranker.rerank_multivector([rs1, rs2])
|
||||||
|
assert "_relevance_score" in result.column_names
|
||||||
|
assert len(result) <= 20
|
||||||
|
|
||||||
|
if len(result) > 1:
|
||||||
|
assert np.all(np.diff(result.column("_relevance_score").to_numpy()) <= 0), (
|
||||||
|
"The _relevance_score should be descending."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Test with duplicate results
|
||||||
|
result_deduped = reranker.rerank_multivector([rs1, rs2, rs1])
|
||||||
|
assert len(result_deduped) == len(result)
|
||||||
|
|
||||||
|
|
||||||
def test_rrf_reranker_distance():
|
def test_rrf_reranker_distance():
|
||||||
data = pa.table(
|
data = pa.table(
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -591,12 +591,11 @@ impl Table {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Optimize the on-disk data by compacting and pruning old data, for better performance.
|
/// Optimize the on-disk data by compacting and pruning old data, for better performance.
|
||||||
#[pyo3(signature = (cleanup_since_ms=None, delete_unverified=None, retrain=None))]
|
#[pyo3(signature = (cleanup_since_ms=None, delete_unverified=None))]
|
||||||
pub fn optimize(
|
pub fn optimize(
|
||||||
self_: PyRef<'_, Self>,
|
self_: PyRef<'_, Self>,
|
||||||
cleanup_since_ms: Option<u64>,
|
cleanup_since_ms: Option<u64>,
|
||||||
delete_unverified: Option<bool>,
|
delete_unverified: Option<bool>,
|
||||||
retrain: Option<bool>,
|
|
||||||
) -> PyResult<Bound<'_, PyAny>> {
|
) -> PyResult<Bound<'_, PyAny>> {
|
||||||
let inner = self_.inner_ref()?.clone();
|
let inner = self_.inner_ref()?.clone();
|
||||||
let older_than = if let Some(ms) = cleanup_since_ms {
|
let older_than = if let Some(ms) = cleanup_since_ms {
|
||||||
@@ -632,10 +631,9 @@ impl Table {
|
|||||||
.prune
|
.prune
|
||||||
.unwrap();
|
.unwrap();
|
||||||
inner
|
inner
|
||||||
.optimize(lancedb::table::OptimizeAction::Index(match retrain {
|
.optimize(lancedb::table::OptimizeAction::Index(
|
||||||
Some(true) => OptimizeOptions::retrain(),
|
OptimizeOptions::default(),
|
||||||
_ => OptimizeOptions::default(),
|
))
|
||||||
}))
|
|
||||||
.await
|
.await
|
||||||
.infer_error()?;
|
.infer_error()?;
|
||||||
Ok(OptimizeStats {
|
Ok(OptimizeStats {
|
||||||
@@ -674,6 +672,9 @@ impl Table {
|
|||||||
if let Some(timeout) = parameters.timeout {
|
if let Some(timeout) = parameters.timeout {
|
||||||
builder.timeout(timeout);
|
builder.timeout(timeout);
|
||||||
}
|
}
|
||||||
|
if let Some(use_index) = parameters.use_index {
|
||||||
|
builder.use_index(use_index);
|
||||||
|
}
|
||||||
|
|
||||||
future_into_py(self_.py(), async move {
|
future_into_py(self_.py(), async move {
|
||||||
let res = builder.execute(Box::new(batches)).await.infer_error()?;
|
let res = builder.execute(Box::new(batches)).await.infer_error()?;
|
||||||
@@ -833,6 +834,7 @@ pub struct MergeInsertParams {
|
|||||||
when_not_matched_by_source_delete: bool,
|
when_not_matched_by_source_delete: bool,
|
||||||
when_not_matched_by_source_condition: Option<String>,
|
when_not_matched_by_source_condition: Option<String>,
|
||||||
timeout: Option<std::time::Duration>,
|
timeout: Option<std::time::Duration>,
|
||||||
|
use_index: Option<bool>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[pyclass]
|
#[pyclass]
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "lancedb"
|
name = "lancedb"
|
||||||
version = "0.22.1-beta.2"
|
version = "0.22.1"
|
||||||
edition.workspace = true
|
edition.workspace = true
|
||||||
description = "LanceDB: A serverless, low-latency vector database for AI applications"
|
description = "LanceDB: A serverless, low-latency vector database for AI applications"
|
||||||
license.workspace = true
|
license.workspace = true
|
||||||
@@ -36,6 +36,7 @@ lance-table = { workspace = true }
|
|||||||
lance-linalg = { workspace = true }
|
lance-linalg = { workspace = true }
|
||||||
lance-testing = { workspace = true }
|
lance-testing = { workspace = true }
|
||||||
lance-encoding = { workspace = true }
|
lance-encoding = { workspace = true }
|
||||||
|
lance-namespace = { workspace = true }
|
||||||
moka = { workspace = true }
|
moka = { workspace = true }
|
||||||
pin-project = { workspace = true }
|
pin-project = { workspace = true }
|
||||||
tokio = { version = "1.23", features = ["rt-multi-thread"] }
|
tokio = { version = "1.23", features = ["rt-multi-thread"] }
|
||||||
|
|||||||
@@ -1015,6 +1015,117 @@ pub fn connect(uri: &str) -> ConnectBuilder {
|
|||||||
ConnectBuilder::new(uri)
|
ConnectBuilder::new(uri)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub struct ConnectNamespaceBuilder {
|
||||||
|
ns_impl: String,
|
||||||
|
properties: HashMap<String, String>,
|
||||||
|
storage_options: HashMap<String, String>,
|
||||||
|
read_consistency_interval: Option<std::time::Duration>,
|
||||||
|
embedding_registry: Option<Arc<dyn EmbeddingRegistry>>,
|
||||||
|
session: Option<Arc<lance::session::Session>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ConnectNamespaceBuilder {
|
||||||
|
fn new(ns_impl: &str, properties: HashMap<String, String>) -> Self {
|
||||||
|
Self {
|
||||||
|
ns_impl: ns_impl.to_string(),
|
||||||
|
properties,
|
||||||
|
storage_options: HashMap::new(),
|
||||||
|
read_consistency_interval: None,
|
||||||
|
embedding_registry: None,
|
||||||
|
session: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Set an option for the storage layer.
|
||||||
|
///
|
||||||
|
/// See available options at <https://lancedb.github.io/lancedb/guides/storage/>
|
||||||
|
pub fn storage_option(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
|
||||||
|
self.storage_options.insert(key.into(), value.into());
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Set multiple options for the storage layer.
|
||||||
|
///
|
||||||
|
/// See available options at <https://lancedb.github.io/lancedb/guides/storage/>
|
||||||
|
pub fn storage_options(
|
||||||
|
mut self,
|
||||||
|
pairs: impl IntoIterator<Item = (impl Into<String>, impl Into<String>)>,
|
||||||
|
) -> Self {
|
||||||
|
for (key, value) in pairs {
|
||||||
|
self.storage_options.insert(key.into(), value.into());
|
||||||
|
}
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The interval at which to check for updates from other processes.
|
||||||
|
///
|
||||||
|
/// If left unset, consistency is not checked. For maximum read
|
||||||
|
/// performance, this is the default. For strong consistency, set this to
|
||||||
|
/// zero seconds. Then every read will check for updates from other processes.
|
||||||
|
/// As a compromise, set this to a non-zero duration for eventual consistency.
|
||||||
|
pub fn read_consistency_interval(
|
||||||
|
mut self,
|
||||||
|
read_consistency_interval: std::time::Duration,
|
||||||
|
) -> Self {
|
||||||
|
self.read_consistency_interval = Some(read_consistency_interval);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Provide a custom [`EmbeddingRegistry`] to use for this connection.
|
||||||
|
pub fn embedding_registry(mut self, registry: Arc<dyn EmbeddingRegistry>) -> Self {
|
||||||
|
self.embedding_registry = Some(registry);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Set a custom session for object stores and caching.
|
||||||
|
///
|
||||||
|
/// By default, a new session with default configuration will be created.
|
||||||
|
/// This method allows you to provide a custom session with your own
|
||||||
|
/// configuration for object store registries, caching, etc.
|
||||||
|
pub fn session(mut self, session: Arc<lance::session::Session>) -> Self {
|
||||||
|
self.session = Some(session);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Execute the connection
|
||||||
|
pub async fn execute(self) -> Result<Connection> {
|
||||||
|
use crate::database::namespace::LanceNamespaceDatabase;
|
||||||
|
|
||||||
|
let internal = Arc::new(
|
||||||
|
LanceNamespaceDatabase::connect(
|
||||||
|
&self.ns_impl,
|
||||||
|
self.properties,
|
||||||
|
self.storage_options,
|
||||||
|
self.read_consistency_interval,
|
||||||
|
self.session,
|
||||||
|
)
|
||||||
|
.await?,
|
||||||
|
);
|
||||||
|
|
||||||
|
Ok(Connection {
|
||||||
|
internal,
|
||||||
|
uri: format!("namespace://{}", self.ns_impl),
|
||||||
|
embedding_registry: self
|
||||||
|
.embedding_registry
|
||||||
|
.unwrap_or_else(|| Arc::new(MemoryRegistry::new())),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Connect to a LanceDB database through a namespace.
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
///
|
||||||
|
/// * `ns_impl` - The namespace implementation to use (e.g., "dir" for directory-based, "rest" for REST API)
|
||||||
|
/// * `properties` - Configuration properties for the namespace implementation
|
||||||
|
/// ```
|
||||||
|
pub fn connect_namespace(
|
||||||
|
ns_impl: &str,
|
||||||
|
properties: HashMap<String, String>,
|
||||||
|
) -> ConnectNamespaceBuilder {
|
||||||
|
ConnectNamespaceBuilder::new(ns_impl, properties)
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(all(test, feature = "remote"))]
|
#[cfg(all(test, feature = "remote"))]
|
||||||
mod test_utils {
|
mod test_utils {
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|||||||
@@ -29,6 +29,7 @@ use crate::error::Result;
|
|||||||
use crate::table::{BaseTable, TableDefinition, WriteOptions};
|
use crate::table::{BaseTable, TableDefinition, WriteOptions};
|
||||||
|
|
||||||
pub mod listing;
|
pub mod listing;
|
||||||
|
pub mod namespace;
|
||||||
|
|
||||||
pub trait DatabaseOptions {
|
pub trait DatabaseOptions {
|
||||||
fn serialize_into_map(&self, map: &mut HashMap<String, String>);
|
fn serialize_into_map(&self, map: &mut HashMap<String, String>);
|
||||||
|
|||||||
840
rust/lancedb/src/database/namespace.rs
Normal file
840
rust/lancedb/src/database/namespace.rs
Normal file
@@ -0,0 +1,840 @@
|
|||||||
|
// SPDX-License-Identifier: Apache-2.0
|
||||||
|
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||||
|
|
||||||
|
//! Namespace-based database implementation that delegates table management to lance-namespace
|
||||||
|
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use async_trait::async_trait;
|
||||||
|
use lance_namespace::{
|
||||||
|
connect as connect_namespace,
|
||||||
|
models::{
|
||||||
|
CreateEmptyTableRequest, CreateNamespaceRequest, DescribeTableRequest,
|
||||||
|
DropNamespaceRequest, DropTableRequest, ListNamespacesRequest, ListTablesRequest,
|
||||||
|
},
|
||||||
|
LanceNamespace,
|
||||||
|
};
|
||||||
|
|
||||||
|
use crate::connection::ConnectRequest;
|
||||||
|
use crate::database::listing::ListingDatabase;
|
||||||
|
use crate::error::{Error, Result};
|
||||||
|
|
||||||
|
use super::{
|
||||||
|
BaseTable, CloneTableRequest, CreateNamespaceRequest as DbCreateNamespaceRequest,
|
||||||
|
CreateTableMode, CreateTableRequest as DbCreateTableRequest, Database,
|
||||||
|
DropNamespaceRequest as DbDropNamespaceRequest,
|
||||||
|
ListNamespacesRequest as DbListNamespacesRequest, OpenTableRequest, TableNamesRequest,
|
||||||
|
};
|
||||||
|
|
||||||
|
/// A database implementation that uses lance-namespace for table management
|
||||||
|
pub struct LanceNamespaceDatabase {
|
||||||
|
namespace: Arc<dyn LanceNamespace>,
|
||||||
|
// Storage options to be inherited by tables
|
||||||
|
storage_options: HashMap<String, String>,
|
||||||
|
// Read consistency interval for tables
|
||||||
|
read_consistency_interval: Option<std::time::Duration>,
|
||||||
|
// Optional session for object stores and caching
|
||||||
|
session: Option<Arc<lance::session::Session>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl LanceNamespaceDatabase {
|
||||||
|
pub async fn connect(
|
||||||
|
ns_impl: &str,
|
||||||
|
ns_properties: HashMap<String, String>,
|
||||||
|
storage_options: HashMap<String, String>,
|
||||||
|
read_consistency_interval: Option<std::time::Duration>,
|
||||||
|
session: Option<Arc<lance::session::Session>>,
|
||||||
|
) -> Result<Self> {
|
||||||
|
let namespace = connect_namespace(ns_impl, ns_properties.clone())
|
||||||
|
.await
|
||||||
|
.map_err(|e| Error::InvalidInput {
|
||||||
|
message: format!("Failed to connect to namespace: {:?}", e),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
Ok(Self {
|
||||||
|
namespace,
|
||||||
|
storage_options,
|
||||||
|
read_consistency_interval,
|
||||||
|
session,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Helper method to create a ListingDatabase from a table location
|
||||||
|
///
|
||||||
|
/// This method:
|
||||||
|
/// 1. Validates that the location ends with <table_name>.lance
|
||||||
|
/// 2. Extracts the parent directory from the location
|
||||||
|
/// 3. Creates a ListingDatabase at that parent directory
|
||||||
|
async fn create_listing_database(
|
||||||
|
&self,
|
||||||
|
table_name: &str,
|
||||||
|
location: &str,
|
||||||
|
additional_storage_options: Option<HashMap<String, String>>,
|
||||||
|
) -> Result<Arc<ListingDatabase>> {
|
||||||
|
let expected_suffix = format!("{}.lance", table_name);
|
||||||
|
if !location.ends_with(&expected_suffix) {
|
||||||
|
return Err(Error::Runtime {
|
||||||
|
message: format!(
|
||||||
|
"Invalid table location '{}': expected to end with '{}'",
|
||||||
|
location, expected_suffix
|
||||||
|
),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
let parent_dir = location
|
||||||
|
.rsplit_once('/')
|
||||||
|
.map(|(parent, _)| parent.to_string())
|
||||||
|
.ok_or_else(|| Error::Runtime {
|
||||||
|
message: format!("Invalid table location '{}': no parent directory", location),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let mut merged_storage_options = self.storage_options.clone();
|
||||||
|
if let Some(opts) = additional_storage_options {
|
||||||
|
merged_storage_options.extend(opts);
|
||||||
|
}
|
||||||
|
|
||||||
|
let connect_request = ConnectRequest {
|
||||||
|
uri: parent_dir,
|
||||||
|
options: merged_storage_options,
|
||||||
|
read_consistency_interval: self.read_consistency_interval,
|
||||||
|
session: self.session.clone(),
|
||||||
|
#[cfg(feature = "remote")]
|
||||||
|
client_config: Default::default(),
|
||||||
|
};
|
||||||
|
|
||||||
|
let listing_db = ListingDatabase::connect_with_options(&connect_request)
|
||||||
|
.await
|
||||||
|
.map_err(|e| Error::Runtime {
|
||||||
|
message: format!("Failed to create listing database: {}", e),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
Ok(Arc::new(listing_db))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl std::fmt::Debug for LanceNamespaceDatabase {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
f.debug_struct("LanceNamespaceDatabase")
|
||||||
|
.field("storage_options", &self.storage_options)
|
||||||
|
.field("read_consistency_interval", &self.read_consistency_interval)
|
||||||
|
.finish()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl std::fmt::Display for LanceNamespaceDatabase {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
write!(f, "LanceNamespaceDatabase")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
impl Database for LanceNamespaceDatabase {
|
||||||
|
async fn list_namespaces(&self, request: DbListNamespacesRequest) -> Result<Vec<String>> {
|
||||||
|
let ns_request = ListNamespacesRequest {
|
||||||
|
id: if request.namespace.is_empty() {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
Some(request.namespace)
|
||||||
|
},
|
||||||
|
page_token: request.page_token,
|
||||||
|
limit: request.limit.map(|l| l as i32),
|
||||||
|
};
|
||||||
|
|
||||||
|
let response = self
|
||||||
|
.namespace
|
||||||
|
.list_namespaces(ns_request)
|
||||||
|
.await
|
||||||
|
.map_err(|e| Error::Runtime {
|
||||||
|
message: format!("Failed to list namespaces: {}", e),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
Ok(response.namespaces)
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn create_namespace(&self, request: DbCreateNamespaceRequest) -> Result<()> {
|
||||||
|
let ns_request = CreateNamespaceRequest {
|
||||||
|
id: if request.namespace.is_empty() {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
Some(request.namespace)
|
||||||
|
},
|
||||||
|
mode: None,
|
||||||
|
properties: None,
|
||||||
|
};
|
||||||
|
|
||||||
|
self.namespace
|
||||||
|
.create_namespace(ns_request)
|
||||||
|
.await
|
||||||
|
.map_err(|e| Error::Runtime {
|
||||||
|
message: format!("Failed to create namespace: {}", e),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn drop_namespace(&self, request: DbDropNamespaceRequest) -> Result<()> {
|
||||||
|
let ns_request = DropNamespaceRequest {
|
||||||
|
id: if request.namespace.is_empty() {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
Some(request.namespace)
|
||||||
|
},
|
||||||
|
mode: None,
|
||||||
|
behavior: None,
|
||||||
|
};
|
||||||
|
|
||||||
|
self.namespace
|
||||||
|
.drop_namespace(ns_request)
|
||||||
|
.await
|
||||||
|
.map_err(|e| Error::Runtime {
|
||||||
|
message: format!("Failed to drop namespace: {}", e),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn table_names(&self, request: TableNamesRequest) -> Result<Vec<String>> {
|
||||||
|
let ns_request = ListTablesRequest {
|
||||||
|
id: if request.namespace.is_empty() {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
Some(request.namespace)
|
||||||
|
},
|
||||||
|
page_token: request.start_after,
|
||||||
|
limit: request.limit.map(|l| l as i32),
|
||||||
|
};
|
||||||
|
|
||||||
|
let response =
|
||||||
|
self.namespace
|
||||||
|
.list_tables(ns_request)
|
||||||
|
.await
|
||||||
|
.map_err(|e| Error::Runtime {
|
||||||
|
message: format!("Failed to list tables: {}", e),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
Ok(response.tables)
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn create_table(&self, request: DbCreateTableRequest) -> Result<Arc<dyn BaseTable>> {
|
||||||
|
let mut table_id = request.namespace.clone();
|
||||||
|
table_id.push(request.name.clone());
|
||||||
|
let describe_request = DescribeTableRequest {
|
||||||
|
id: Some(table_id.clone()),
|
||||||
|
version: None,
|
||||||
|
};
|
||||||
|
|
||||||
|
let describe_result = self.namespace.describe_table(describe_request).await;
|
||||||
|
|
||||||
|
match request.mode {
|
||||||
|
CreateTableMode::Create => {
|
||||||
|
if describe_result.is_ok() {
|
||||||
|
return Err(Error::TableAlreadyExists {
|
||||||
|
name: request.name.clone(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
CreateTableMode::Overwrite => {
|
||||||
|
if describe_result.is_ok() {
|
||||||
|
// Drop the existing table - must succeed
|
||||||
|
let drop_request = DropTableRequest {
|
||||||
|
id: Some(table_id.clone()),
|
||||||
|
};
|
||||||
|
self.namespace
|
||||||
|
.drop_table(drop_request)
|
||||||
|
.await
|
||||||
|
.map_err(|e| Error::Runtime {
|
||||||
|
message: format!("Failed to drop existing table for overwrite: {}", e),
|
||||||
|
})?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
CreateTableMode::ExistOk(_) => {
|
||||||
|
if let Ok(response) = describe_result {
|
||||||
|
let location = response.location.ok_or_else(|| Error::Runtime {
|
||||||
|
message: "Table location is missing from namespace response".to_string(),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let listing_db = self
|
||||||
|
.create_listing_database(&request.name, &location, response.storage_options)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
return listing_db
|
||||||
|
.open_table(OpenTableRequest {
|
||||||
|
name: request.name.clone(),
|
||||||
|
namespace: request.namespace.clone(),
|
||||||
|
index_cache_size: None,
|
||||||
|
lance_read_params: None,
|
||||||
|
})
|
||||||
|
.await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut table_id = request.namespace.clone();
|
||||||
|
table_id.push(request.name.clone());
|
||||||
|
|
||||||
|
let create_empty_request = CreateEmptyTableRequest {
|
||||||
|
id: Some(table_id),
|
||||||
|
location: None,
|
||||||
|
properties: if self.storage_options.is_empty() {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
Some(self.storage_options.clone())
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
let create_empty_response = self
|
||||||
|
.namespace
|
||||||
|
.create_empty_table(create_empty_request)
|
||||||
|
.await
|
||||||
|
.map_err(|e| Error::Runtime {
|
||||||
|
message: format!("Failed to create empty table: {}", e),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let location = create_empty_response
|
||||||
|
.location
|
||||||
|
.ok_or_else(|| Error::Runtime {
|
||||||
|
message: "Table location is missing from create_empty_table response".to_string(),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let listing_db = self
|
||||||
|
.create_listing_database(
|
||||||
|
&request.name,
|
||||||
|
&location,
|
||||||
|
create_empty_response.storage_options,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
listing_db.create_table(request).await
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn open_table(&self, request: OpenTableRequest) -> Result<Arc<dyn BaseTable>> {
|
||||||
|
let mut table_id = request.namespace.clone();
|
||||||
|
table_id.push(request.name.clone());
|
||||||
|
|
||||||
|
let describe_request = DescribeTableRequest {
|
||||||
|
id: Some(table_id),
|
||||||
|
version: None,
|
||||||
|
};
|
||||||
|
let response = self
|
||||||
|
.namespace
|
||||||
|
.describe_table(describe_request)
|
||||||
|
.await
|
||||||
|
.map_err(|e| Error::Runtime {
|
||||||
|
message: format!("Failed to describe table: {}", e),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let location = response.location.ok_or_else(|| Error::Runtime {
|
||||||
|
message: "Table location is missing from namespace response".to_string(),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let listing_db = self
|
||||||
|
.create_listing_database(&request.name, &location, response.storage_options)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
listing_db.open_table(request).await
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn clone_table(&self, _request: CloneTableRequest) -> Result<Arc<dyn BaseTable>> {
|
||||||
|
Err(Error::NotSupported {
|
||||||
|
message: "clone_table is not supported for namespace connections".to_string(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn rename_table(
|
||||||
|
&self,
|
||||||
|
_cur_name: &str,
|
||||||
|
_new_name: &str,
|
||||||
|
_cur_namespace: &[String],
|
||||||
|
_new_namespace: &[String],
|
||||||
|
) -> Result<()> {
|
||||||
|
Err(Error::NotSupported {
|
||||||
|
message: "rename_table is not supported for namespace connections".to_string(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn drop_table(&self, name: &str, namespace: &[String]) -> Result<()> {
|
||||||
|
let mut table_id = namespace.to_vec();
|
||||||
|
table_id.push(name.to_string());
|
||||||
|
|
||||||
|
let drop_request = DropTableRequest { id: Some(table_id) };
|
||||||
|
self.namespace
|
||||||
|
.drop_table(drop_request)
|
||||||
|
.await
|
||||||
|
.map_err(|e| Error::Runtime {
|
||||||
|
message: format!("Failed to drop table: {}", e),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn drop_all_tables(&self, namespace: &[String]) -> Result<()> {
|
||||||
|
let tables = self
|
||||||
|
.table_names(TableNamesRequest {
|
||||||
|
namespace: namespace.to_vec(),
|
||||||
|
start_after: None,
|
||||||
|
limit: None,
|
||||||
|
})
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
for table in tables {
|
||||||
|
self.drop_table(&table, namespace).await?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn as_any(&self) -> &dyn std::any::Any {
|
||||||
|
self
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
#[cfg(not(windows))] // TODO: support windows for lance-namespace
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
use crate::connect_namespace;
|
||||||
|
use crate::query::ExecutableQuery;
|
||||||
|
use arrow_array::{Int32Array, RecordBatch, RecordBatchIterator, StringArray};
|
||||||
|
use arrow_schema::{DataType, Field, Schema};
|
||||||
|
use futures::TryStreamExt;
|
||||||
|
use tempfile::tempdir;
|
||||||
|
|
||||||
|
/// Helper function to create test data
|
||||||
|
fn create_test_data() -> RecordBatchIterator<
|
||||||
|
std::vec::IntoIter<std::result::Result<RecordBatch, arrow_schema::ArrowError>>,
|
||||||
|
> {
|
||||||
|
let schema = Arc::new(Schema::new(vec![
|
||||||
|
Field::new("id", DataType::Int32, false),
|
||||||
|
Field::new("name", DataType::Utf8, false),
|
||||||
|
]));
|
||||||
|
|
||||||
|
let id_array = Int32Array::from(vec![1, 2, 3, 4, 5]);
|
||||||
|
let name_array = StringArray::from(vec!["Alice", "Bob", "Charlie", "David", "Eve"]);
|
||||||
|
|
||||||
|
let batch = RecordBatch::try_new(
|
||||||
|
schema.clone(),
|
||||||
|
vec![Arc::new(id_array), Arc::new(name_array)],
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
RecordBatchIterator::new(vec![std::result::Result::Ok(batch)].into_iter(), schema)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_namespace_connection_simple() {
|
||||||
|
// Test that namespace connections work with simple connect_namespace(impl_type, properties)
|
||||||
|
let tmp_dir = tempdir().unwrap();
|
||||||
|
let root_path = tmp_dir.path().to_str().unwrap().to_string();
|
||||||
|
|
||||||
|
let mut properties = HashMap::new();
|
||||||
|
properties.insert("root".to_string(), root_path);
|
||||||
|
|
||||||
|
// This should succeed with directory-based namespace
|
||||||
|
let result = connect_namespace("dir", properties).execute().await;
|
||||||
|
|
||||||
|
assert!(result.is_ok());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_namespace_connection_with_storage_options() {
|
||||||
|
// Test namespace connections with storage options
|
||||||
|
let tmp_dir = tempdir().unwrap();
|
||||||
|
let root_path = tmp_dir.path().to_str().unwrap().to_string();
|
||||||
|
|
||||||
|
let mut properties = HashMap::new();
|
||||||
|
properties.insert("root".to_string(), root_path);
|
||||||
|
|
||||||
|
// This should succeed with directory-based namespace and storage options
|
||||||
|
let result = connect_namespace("dir", properties)
|
||||||
|
.storage_option("timeout", "30s")
|
||||||
|
.execute()
|
||||||
|
.await;
|
||||||
|
|
||||||
|
assert!(result.is_ok());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_namespace_connection_with_all_options() {
|
||||||
|
use crate::embeddings::MemoryRegistry;
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
// Test namespace connections with all configuration options
|
||||||
|
let tmp_dir = tempdir().unwrap();
|
||||||
|
let root_path = tmp_dir.path().to_str().unwrap().to_string();
|
||||||
|
|
||||||
|
let mut properties = HashMap::new();
|
||||||
|
properties.insert("root".to_string(), root_path);
|
||||||
|
|
||||||
|
let embedding_registry = Arc::new(MemoryRegistry::new());
|
||||||
|
let session = Arc::new(lance::session::Session::default());
|
||||||
|
|
||||||
|
// Test with all options set
|
||||||
|
let result = connect_namespace("dir", properties)
|
||||||
|
.storage_option("timeout", "30s")
|
||||||
|
.storage_options([("cache_size", "1gb"), ("region", "us-east-1")])
|
||||||
|
.read_consistency_interval(Duration::from_secs(5))
|
||||||
|
.embedding_registry(embedding_registry.clone())
|
||||||
|
.session(session.clone())
|
||||||
|
.execute()
|
||||||
|
.await;
|
||||||
|
|
||||||
|
assert!(result.is_ok());
|
||||||
|
|
||||||
|
let conn = result.unwrap();
|
||||||
|
|
||||||
|
// Verify embedding registry is set correctly
|
||||||
|
assert!(std::ptr::eq(
|
||||||
|
conn.embedding_registry() as *const _,
|
||||||
|
embedding_registry.as_ref() as *const _
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_namespace_create_table_basic() {
|
||||||
|
// Setup: Create a temporary directory for the namespace
|
||||||
|
let tmp_dir = tempdir().unwrap();
|
||||||
|
let root_path = tmp_dir.path().to_str().unwrap().to_string();
|
||||||
|
|
||||||
|
// Connect to namespace using DirectoryNamespace
|
||||||
|
let mut properties = HashMap::new();
|
||||||
|
properties.insert("root".to_string(), root_path);
|
||||||
|
|
||||||
|
let conn = connect_namespace("dir", properties)
|
||||||
|
.execute()
|
||||||
|
.await
|
||||||
|
.expect("Failed to connect to namespace");
|
||||||
|
|
||||||
|
// Test: Create a table
|
||||||
|
let test_data = create_test_data();
|
||||||
|
let table = conn
|
||||||
|
.create_table("test_table", test_data)
|
||||||
|
.execute()
|
||||||
|
.await
|
||||||
|
.expect("Failed to create table");
|
||||||
|
|
||||||
|
// Verify: Table was created and can be queried
|
||||||
|
let results = table
|
||||||
|
.query()
|
||||||
|
.execute()
|
||||||
|
.await
|
||||||
|
.expect("Failed to query table")
|
||||||
|
.try_collect::<Vec<_>>()
|
||||||
|
.await
|
||||||
|
.expect("Failed to collect results");
|
||||||
|
|
||||||
|
assert_eq!(results.len(), 1);
|
||||||
|
assert_eq!(results[0].num_rows(), 5);
|
||||||
|
|
||||||
|
// Verify: Table appears in table_names
|
||||||
|
let table_names = conn
|
||||||
|
.table_names()
|
||||||
|
.execute()
|
||||||
|
.await
|
||||||
|
.expect("Failed to list tables");
|
||||||
|
assert!(table_names.contains(&"test_table".to_string()));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_namespace_describe_table() {
|
||||||
|
// Setup: Create a temporary directory for the namespace
|
||||||
|
let tmp_dir = tempdir().unwrap();
|
||||||
|
let root_path = tmp_dir.path().to_str().unwrap().to_string();
|
||||||
|
|
||||||
|
// Connect to namespace
|
||||||
|
let mut properties = HashMap::new();
|
||||||
|
properties.insert("root".to_string(), root_path);
|
||||||
|
|
||||||
|
let conn = connect_namespace("dir", properties)
|
||||||
|
.execute()
|
||||||
|
.await
|
||||||
|
.expect("Failed to connect to namespace");
|
||||||
|
|
||||||
|
// Create a table first
|
||||||
|
let test_data = create_test_data();
|
||||||
|
let _table = conn
|
||||||
|
.create_table("describe_test", test_data)
|
||||||
|
.execute()
|
||||||
|
.await
|
||||||
|
.expect("Failed to create table");
|
||||||
|
|
||||||
|
// Test: Open the table (which internally uses describe_table)
|
||||||
|
let opened_table = conn
|
||||||
|
.open_table("describe_test")
|
||||||
|
.execute()
|
||||||
|
.await
|
||||||
|
.expect("Failed to open table");
|
||||||
|
|
||||||
|
// Verify: Can query the opened table
|
||||||
|
let results = opened_table
|
||||||
|
.query()
|
||||||
|
.execute()
|
||||||
|
.await
|
||||||
|
.expect("Failed to query table")
|
||||||
|
.try_collect::<Vec<_>>()
|
||||||
|
.await
|
||||||
|
.expect("Failed to collect results");
|
||||||
|
|
||||||
|
assert_eq!(results.len(), 1);
|
||||||
|
assert_eq!(results[0].num_rows(), 5);
|
||||||
|
|
||||||
|
// Verify schema matches
|
||||||
|
let schema = opened_table.schema().await.expect("Failed to get schema");
|
||||||
|
assert_eq!(schema.fields.len(), 2);
|
||||||
|
assert_eq!(schema.field(0).name(), "id");
|
||||||
|
assert_eq!(schema.field(1).name(), "name");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_namespace_create_table_overwrite_mode() {
|
||||||
|
// Setup: Create a temporary directory for the namespace
|
||||||
|
let tmp_dir = tempdir().unwrap();
|
||||||
|
let root_path = tmp_dir.path().to_str().unwrap().to_string();
|
||||||
|
|
||||||
|
let mut properties = HashMap::new();
|
||||||
|
properties.insert("root".to_string(), root_path);
|
||||||
|
|
||||||
|
let conn = connect_namespace("dir", properties)
|
||||||
|
.execute()
|
||||||
|
.await
|
||||||
|
.expect("Failed to connect to namespace");
|
||||||
|
|
||||||
|
// Create initial table with 5 rows
|
||||||
|
let test_data1 = create_test_data();
|
||||||
|
let _table1 = conn
|
||||||
|
.create_table("overwrite_test", test_data1)
|
||||||
|
.execute()
|
||||||
|
.await
|
||||||
|
.expect("Failed to create table");
|
||||||
|
|
||||||
|
// Create new data with 3 rows
|
||||||
|
let schema = Arc::new(Schema::new(vec![
|
||||||
|
Field::new("id", DataType::Int32, false),
|
||||||
|
Field::new("name", DataType::Utf8, false),
|
||||||
|
]));
|
||||||
|
let id_array = Int32Array::from(vec![10, 20, 30]);
|
||||||
|
let name_array = StringArray::from(vec!["New1", "New2", "New3"]);
|
||||||
|
let test_data2 = RecordBatch::try_new(
|
||||||
|
schema.clone(),
|
||||||
|
vec![Arc::new(id_array), Arc::new(name_array)],
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
// Test: Overwrite the table
|
||||||
|
let table2 = conn
|
||||||
|
.create_table(
|
||||||
|
"overwrite_test",
|
||||||
|
RecordBatchIterator::new(
|
||||||
|
vec![std::result::Result::Ok(test_data2)].into_iter(),
|
||||||
|
schema,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
.mode(CreateTableMode::Overwrite)
|
||||||
|
.execute()
|
||||||
|
.await
|
||||||
|
.expect("Failed to overwrite table");
|
||||||
|
|
||||||
|
// Verify: Table has new data (3 rows instead of 5)
|
||||||
|
let results = table2
|
||||||
|
.query()
|
||||||
|
.execute()
|
||||||
|
.await
|
||||||
|
.expect("Failed to query table")
|
||||||
|
.try_collect::<Vec<_>>()
|
||||||
|
.await
|
||||||
|
.expect("Failed to collect results");
|
||||||
|
|
||||||
|
assert_eq!(results.len(), 1);
|
||||||
|
assert_eq!(results[0].num_rows(), 3);
|
||||||
|
|
||||||
|
// Verify the data is actually the new data
|
||||||
|
let id_col = results[0]
|
||||||
|
.column(0)
|
||||||
|
.as_any()
|
||||||
|
.downcast_ref::<Int32Array>()
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(id_col.value(0), 10);
|
||||||
|
assert_eq!(id_col.value(1), 20);
|
||||||
|
assert_eq!(id_col.value(2), 30);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_namespace_create_table_exist_ok_mode() {
|
||||||
|
// Setup: Create a temporary directory for the namespace
|
||||||
|
let tmp_dir = tempdir().unwrap();
|
||||||
|
let root_path = tmp_dir.path().to_str().unwrap().to_string();
|
||||||
|
|
||||||
|
let mut properties = HashMap::new();
|
||||||
|
properties.insert("root".to_string(), root_path);
|
||||||
|
|
||||||
|
let conn = connect_namespace("dir", properties)
|
||||||
|
.execute()
|
||||||
|
.await
|
||||||
|
.expect("Failed to connect to namespace");
|
||||||
|
|
||||||
|
// Create initial table with test data
|
||||||
|
let test_data1 = create_test_data();
|
||||||
|
let _table1 = conn
|
||||||
|
.create_table("exist_ok_test", test_data1)
|
||||||
|
.execute()
|
||||||
|
.await
|
||||||
|
.expect("Failed to create table");
|
||||||
|
|
||||||
|
// Try to create again with exist_ok mode
|
||||||
|
let test_data2 = create_test_data();
|
||||||
|
let table2 = conn
|
||||||
|
.create_table("exist_ok_test", test_data2)
|
||||||
|
.mode(CreateTableMode::exist_ok(|req| req))
|
||||||
|
.execute()
|
||||||
|
.await
|
||||||
|
.expect("Failed with exist_ok mode");
|
||||||
|
|
||||||
|
// Verify: Table still has original data (5 rows)
|
||||||
|
let results = table2
|
||||||
|
.query()
|
||||||
|
.execute()
|
||||||
|
.await
|
||||||
|
.expect("Failed to query table")
|
||||||
|
.try_collect::<Vec<_>>()
|
||||||
|
.await
|
||||||
|
.expect("Failed to collect results");
|
||||||
|
|
||||||
|
assert_eq!(results.len(), 1);
|
||||||
|
assert_eq!(results[0].num_rows(), 5);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_namespace_create_multiple_tables() {
|
||||||
|
// Setup: Create a temporary directory for the namespace
|
||||||
|
let tmp_dir = tempdir().unwrap();
|
||||||
|
let root_path = tmp_dir.path().to_str().unwrap().to_string();
|
||||||
|
|
||||||
|
let mut properties = HashMap::new();
|
||||||
|
properties.insert("root".to_string(), root_path);
|
||||||
|
|
||||||
|
let conn = connect_namespace("dir", properties)
|
||||||
|
.execute()
|
||||||
|
.await
|
||||||
|
.expect("Failed to connect to namespace");
|
||||||
|
|
||||||
|
// Create first table
|
||||||
|
let test_data1 = create_test_data();
|
||||||
|
let _table1 = conn
|
||||||
|
.create_table("table1", test_data1)
|
||||||
|
.execute()
|
||||||
|
.await
|
||||||
|
.expect("Failed to create first table");
|
||||||
|
|
||||||
|
// Create second table
|
||||||
|
let test_data2 = create_test_data();
|
||||||
|
let _table2 = conn
|
||||||
|
.create_table("table2", test_data2)
|
||||||
|
.execute()
|
||||||
|
.await
|
||||||
|
.expect("Failed to create second table");
|
||||||
|
|
||||||
|
// Verify: Both tables appear in table list
|
||||||
|
let table_names = conn
|
||||||
|
.table_names()
|
||||||
|
.execute()
|
||||||
|
.await
|
||||||
|
.expect("Failed to list tables");
|
||||||
|
|
||||||
|
assert!(table_names.contains(&"table1".to_string()));
|
||||||
|
assert!(table_names.contains(&"table2".to_string()));
|
||||||
|
|
||||||
|
// Verify: Can open both tables
|
||||||
|
let opened_table1 = conn
|
||||||
|
.open_table("table1")
|
||||||
|
.execute()
|
||||||
|
.await
|
||||||
|
.expect("Failed to open table1");
|
||||||
|
|
||||||
|
let opened_table2 = conn
|
||||||
|
.open_table("table2")
|
||||||
|
.execute()
|
||||||
|
.await
|
||||||
|
.expect("Failed to open table2");
|
||||||
|
|
||||||
|
// Verify both tables work
|
||||||
|
let count1 = opened_table1
|
||||||
|
.count_rows(None)
|
||||||
|
.await
|
||||||
|
.expect("Failed to count rows in table1");
|
||||||
|
assert_eq!(count1, 5);
|
||||||
|
|
||||||
|
let count2 = opened_table2
|
||||||
|
.count_rows(None)
|
||||||
|
.await
|
||||||
|
.expect("Failed to count rows in table2");
|
||||||
|
assert_eq!(count2, 5);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_namespace_table_not_found() {
|
||||||
|
// Setup: Create a temporary directory for the namespace
|
||||||
|
let tmp_dir = tempdir().unwrap();
|
||||||
|
let root_path = tmp_dir.path().to_str().unwrap().to_string();
|
||||||
|
|
||||||
|
let mut properties = HashMap::new();
|
||||||
|
properties.insert("root".to_string(), root_path);
|
||||||
|
|
||||||
|
let conn = connect_namespace("dir", properties)
|
||||||
|
.execute()
|
||||||
|
.await
|
||||||
|
.expect("Failed to connect to namespace");
|
||||||
|
|
||||||
|
// Test: Try to open a non-existent table
|
||||||
|
let result = conn.open_table("non_existent_table").execute().await;
|
||||||
|
|
||||||
|
// Verify: Should return an error
|
||||||
|
assert!(result.is_err());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_namespace_drop_table() {
|
||||||
|
// Setup: Create a temporary directory for the namespace
|
||||||
|
let tmp_dir = tempdir().unwrap();
|
||||||
|
let root_path = tmp_dir.path().to_str().unwrap().to_string();
|
||||||
|
|
||||||
|
let mut properties = HashMap::new();
|
||||||
|
properties.insert("root".to_string(), root_path);
|
||||||
|
|
||||||
|
let conn = connect_namespace("dir", properties)
|
||||||
|
.execute()
|
||||||
|
.await
|
||||||
|
.expect("Failed to connect to namespace");
|
||||||
|
|
||||||
|
// Create a table first
|
||||||
|
let test_data = create_test_data();
|
||||||
|
let _table = conn
|
||||||
|
.create_table("drop_test", test_data)
|
||||||
|
.execute()
|
||||||
|
.await
|
||||||
|
.expect("Failed to create table");
|
||||||
|
|
||||||
|
// Verify table exists
|
||||||
|
let table_names_before = conn
|
||||||
|
.table_names()
|
||||||
|
.execute()
|
||||||
|
.await
|
||||||
|
.expect("Failed to list tables");
|
||||||
|
assert!(table_names_before.contains(&"drop_test".to_string()));
|
||||||
|
|
||||||
|
// Test: Drop the table
|
||||||
|
conn.drop_table("drop_test", &[])
|
||||||
|
.await
|
||||||
|
.expect("Failed to drop table");
|
||||||
|
|
||||||
|
// Verify: Table no longer exists
|
||||||
|
let table_names_after = conn
|
||||||
|
.table_names()
|
||||||
|
.execute()
|
||||||
|
.await
|
||||||
|
.expect("Failed to list tables");
|
||||||
|
assert!(!table_names_after.contains(&"drop_test".to_string()));
|
||||||
|
|
||||||
|
// Verify: Cannot open dropped table
|
||||||
|
let open_result = conn.open_table("drop_test").execute().await;
|
||||||
|
assert!(open_result.is_err());
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -8,7 +8,7 @@
|
|||||||
//! values
|
//! values
|
||||||
use std::cmp::max;
|
use std::cmp::max;
|
||||||
|
|
||||||
use lance::table::format::{Index, Manifest};
|
use lance::table::format::{IndexMetadata, Manifest};
|
||||||
|
|
||||||
use crate::DistanceType;
|
use crate::DistanceType;
|
||||||
|
|
||||||
@@ -19,7 +19,7 @@ pub struct VectorIndex {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl VectorIndex {
|
impl VectorIndex {
|
||||||
pub fn new_from_format(manifest: &Manifest, index: &Index) -> Self {
|
pub fn new_from_format(manifest: &Manifest, index: &IndexMetadata) -> Self {
|
||||||
let fields = index
|
let fields = index
|
||||||
.fields
|
.fields
|
||||||
.iter()
|
.iter()
|
||||||
|
|||||||
@@ -212,7 +212,7 @@ use std::fmt::Display;
|
|||||||
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
pub use connection::Connection;
|
pub use connection::{ConnectNamespaceBuilder, Connection};
|
||||||
pub use error::{Error, Result};
|
pub use error::{Error, Result};
|
||||||
use lance_linalg::distance::DistanceType as LanceDistanceType;
|
use lance_linalg::distance::DistanceType as LanceDistanceType;
|
||||||
pub use table::Table;
|
pub use table::Table;
|
||||||
@@ -289,6 +289,8 @@ impl Display for DistanceType {
|
|||||||
|
|
||||||
/// Connect to a database
|
/// Connect to a database
|
||||||
pub use connection::connect;
|
pub use connection::connect;
|
||||||
|
/// Connect to a namespace-backed database
|
||||||
|
pub use connection::connect_namespace;
|
||||||
|
|
||||||
/// Re-export Lance Session and ObjectStoreRegistry for custom session creation
|
/// Re-export Lance Session and ObjectStoreRegistry for custom session creation
|
||||||
pub use lance::session::Session;
|
pub use lance::session::Session;
|
||||||
|
|||||||
@@ -1452,6 +1452,14 @@ struct MergeInsertRequest {
|
|||||||
when_not_matched_insert_all: bool,
|
when_not_matched_insert_all: bool,
|
||||||
when_not_matched_by_source_delete: bool,
|
when_not_matched_by_source_delete: bool,
|
||||||
when_not_matched_by_source_delete_filt: Option<String>,
|
when_not_matched_by_source_delete_filt: Option<String>,
|
||||||
|
// For backwards compatibility, only serialize use_index when it's false
|
||||||
|
// (the default is true)
|
||||||
|
#[serde(skip_serializing_if = "is_true")]
|
||||||
|
use_index: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_true(b: &bool) -> bool {
|
||||||
|
*b
|
||||||
}
|
}
|
||||||
|
|
||||||
impl TryFrom<MergeInsertBuilder> for MergeInsertRequest {
|
impl TryFrom<MergeInsertBuilder> for MergeInsertRequest {
|
||||||
@@ -1476,6 +1484,8 @@ impl TryFrom<MergeInsertBuilder> for MergeInsertRequest {
|
|||||||
when_not_matched_insert_all: value.when_not_matched_insert_all,
|
when_not_matched_insert_all: value.when_not_matched_insert_all,
|
||||||
when_not_matched_by_source_delete: value.when_not_matched_by_source_delete,
|
when_not_matched_by_source_delete: value.when_not_matched_by_source_delete,
|
||||||
when_not_matched_by_source_delete_filt: value.when_not_matched_by_source_delete_filt,
|
when_not_matched_by_source_delete_filt: value.when_not_matched_by_source_delete_filt,
|
||||||
|
// Only serialize use_index when it's false for backwards compatibility
|
||||||
|
use_index: value.use_index,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1942,6 +1952,7 @@ mod tests {
|
|||||||
assert_eq!(params["when_not_matched_by_source_delete"], "false");
|
assert_eq!(params["when_not_matched_by_source_delete"], "false");
|
||||||
assert!(!params.contains_key("when_matched_update_all_filt"));
|
assert!(!params.contains_key("when_matched_update_all_filt"));
|
||||||
assert!(!params.contains_key("when_not_matched_by_source_delete_filt"));
|
assert!(!params.contains_key("when_not_matched_by_source_delete_filt"));
|
||||||
|
assert!(!params.contains_key("use_index"));
|
||||||
|
|
||||||
if old_server {
|
if old_server {
|
||||||
http::Response::builder().status(200).body("{}").unwrap()
|
http::Response::builder().status(200).body("{}").unwrap()
|
||||||
|
|||||||
@@ -1976,6 +1976,8 @@ impl NativeTable {
|
|||||||
/// Delete keys from the config
|
/// Delete keys from the config
|
||||||
pub async fn delete_config_keys(&self, delete_keys: &[&str]) -> Result<()> {
|
pub async fn delete_config_keys(&self, delete_keys: &[&str]) -> Result<()> {
|
||||||
let mut dataset = self.dataset.get_mut().await?;
|
let mut dataset = self.dataset.get_mut().await?;
|
||||||
|
// TODO: update this when we implement metadata APIs
|
||||||
|
#[allow(deprecated)]
|
||||||
dataset.delete_config_keys(delete_keys).await?;
|
dataset.delete_config_keys(delete_keys).await?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -1986,6 +1988,8 @@ impl NativeTable {
|
|||||||
upsert_values: impl IntoIterator<Item = (String, String)>,
|
upsert_values: impl IntoIterator<Item = (String, String)>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let mut dataset = self.dataset.get_mut().await?;
|
let mut dataset = self.dataset.get_mut().await?;
|
||||||
|
// TODO: update this when we implement metadata APIs
|
||||||
|
#[allow(deprecated)]
|
||||||
dataset.replace_schema_metadata(upsert_values).await?;
|
dataset.replace_schema_metadata(upsert_values).await?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -2395,6 +2399,7 @@ impl BaseTable for NativeTable {
|
|||||||
} else {
|
} else {
|
||||||
builder.when_not_matched_by_source(WhenNotMatchedBySource::Keep);
|
builder.when_not_matched_by_source(WhenNotMatchedBySource::Keep);
|
||||||
}
|
}
|
||||||
|
builder.use_index(params.use_index);
|
||||||
|
|
||||||
let future = if let Some(timeout) = params.timeout {
|
let future = if let Some(timeout) = params.timeout {
|
||||||
// The default retry timeout is 30s, so we pass the full timeout down
|
// The default retry timeout is 30s, so we pass the full timeout down
|
||||||
@@ -2902,6 +2907,38 @@ mod tests {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_merge_insert_use_index() {
|
||||||
|
let tmp_dir = tempdir().unwrap();
|
||||||
|
let uri = tmp_dir.path().to_str().unwrap();
|
||||||
|
let conn = connect(uri).execute().await.unwrap();
|
||||||
|
|
||||||
|
// Create a dataset with i=0..10
|
||||||
|
let batches = merge_insert_test_batches(0, 0);
|
||||||
|
let table = conn
|
||||||
|
.create_table("my_table", batches)
|
||||||
|
.execute()
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(table.count_rows(None).await.unwrap(), 10);
|
||||||
|
|
||||||
|
// Test use_index=true (default behavior)
|
||||||
|
let new_batches = Box::new(merge_insert_test_batches(5, 1));
|
||||||
|
let mut merge_insert_builder = table.merge_insert(&["i"]);
|
||||||
|
merge_insert_builder.when_not_matched_insert_all();
|
||||||
|
merge_insert_builder.use_index(true);
|
||||||
|
merge_insert_builder.execute(new_batches).await.unwrap();
|
||||||
|
assert_eq!(table.count_rows(None).await.unwrap(), 15);
|
||||||
|
|
||||||
|
// Test use_index=false (force table scan)
|
||||||
|
let new_batches = Box::new(merge_insert_test_batches(15, 2));
|
||||||
|
let mut merge_insert_builder = table.merge_insert(&["i"]);
|
||||||
|
merge_insert_builder.when_not_matched_insert_all();
|
||||||
|
merge_insert_builder.use_index(false);
|
||||||
|
merge_insert_builder.execute(new_batches).await.unwrap();
|
||||||
|
assert_eq!(table.count_rows(None).await.unwrap(), 25);
|
||||||
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_add_overwrite() {
|
async fn test_add_overwrite() {
|
||||||
let tmp_dir = tempdir().unwrap();
|
let tmp_dir = tempdir().unwrap();
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ pub struct MergeInsertBuilder {
|
|||||||
pub(crate) when_not_matched_by_source_delete: bool,
|
pub(crate) when_not_matched_by_source_delete: bool,
|
||||||
pub(crate) when_not_matched_by_source_delete_filt: Option<String>,
|
pub(crate) when_not_matched_by_source_delete_filt: Option<String>,
|
||||||
pub(crate) timeout: Option<Duration>,
|
pub(crate) timeout: Option<Duration>,
|
||||||
|
pub(crate) use_index: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl MergeInsertBuilder {
|
impl MergeInsertBuilder {
|
||||||
@@ -35,6 +36,7 @@ impl MergeInsertBuilder {
|
|||||||
when_not_matched_by_source_delete: false,
|
when_not_matched_by_source_delete: false,
|
||||||
when_not_matched_by_source_delete_filt: None,
|
when_not_matched_by_source_delete_filt: None,
|
||||||
timeout: None,
|
timeout: None,
|
||||||
|
use_index: true,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -101,6 +103,19 @@ impl MergeInsertBuilder {
|
|||||||
self
|
self
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Controls whether to use indexes for the merge operation.
|
||||||
|
///
|
||||||
|
/// When set to `true` (the default), the operation will use an index if available
|
||||||
|
/// on the join key for improved performance. When set to `false`, it forces a full
|
||||||
|
/// table scan even if an index exists. This can be useful for benchmarking or when
|
||||||
|
/// the query optimizer chooses a suboptimal path.
|
||||||
|
///
|
||||||
|
/// If not set, defaults to `true` (use index if available).
|
||||||
|
pub fn use_index(&mut self, use_index: bool) -> &mut Self {
|
||||||
|
self.use_index = use_index;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
/// Executes the merge insert operation
|
/// Executes the merge insert operation
|
||||||
///
|
///
|
||||||
/// Returns version and statistics about the merge operation including the number of rows
|
/// Returns version and statistics about the merge operation including the number of rows
|
||||||
|
|||||||
Reference in New Issue
Block a user