Compare commits


7 Commits

Author SHA1 Message Date
Lance Release
5e1e9add07 Bump version: 0.25.1-beta.1 → 0.25.1-beta.2 2025-09-18 20:21:33 +00:00
Jack Ye
97e9938dfe fix: add missing validations to namespace operations (#2659) 2025-09-17 23:27:04 -07:00
Weston Pace
1d4b92e01e refactor: remove catalog implementation now that we have namespaces in database (#2662)
We had previously prototyped a `Catalog` trait anticipating a
three-tiered Catalog-Database-Table structure. Now that we have
namespaces in the `Database` we can support any tiering scheme and the
`Catalog` trait is no longer needed.
2025-09-17 08:40:20 -07:00
Le Duc Manh
4c9fc3044b fix: use create to resolve variables (#2640)
# What
- Use `create` to resolve variable values

# Reference
Fixes #2181

---------

Co-authored-by: Will Jones <willjones127@gmail.com>
2025-09-12 13:07:32 -07:00
Jack Ye
0ebc8d45a8 chore: fix no lock build warnings and CI timeouts (#2650)
Example CI failures:
- publish build timeout:
https://github.com/lancedb/lancedb/actions/runs/17626482881/job/50084552906
- doc test build timeout:
https://github.com/lancedb/lancedb/actions/runs/17627058590/job/50086456818
2025-09-11 15:30:35 -07:00
BubbleCal
f7d78c3420 feat: add 'target_partition_size' param (#2642)
This exposes the `target_partition_size` param from `lance`.

---------

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
2025-09-11 22:56:16 +08:00
Lance Release
6ea6884260 Bump version: 0.22.1-beta.0 → 0.22.1-beta.1 2025-09-10 20:49:43 +00:00
42 changed files with 330 additions and 896 deletions

View File

@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.22.1-beta.0"
+current_version = "0.22.1-beta.1"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
     (?P<minor>0|[1-9]\\d*)\\.

View File

@@ -24,7 +24,8 @@ env:
 jobs:
   test-python:
     name: Test doc python code
-    runs-on: ubuntu-24.04
+    runs-on: warp-ubuntu-2204-x64-8x
+    timeout-minutes: 60
     steps:
       - name: Checkout
         uses: actions/checkout@v4
@@ -48,7 +49,6 @@ jobs:
         uses: swatinem/rust-cache@v2
       - name: Build Python
         working-directory: docs/test
-        timeout-minutes: 60
         run:
           python -m pip install --extra-index-url https://pypi.fury.io/lancedb/ -r requirements.txt
       - name: Create test files

Cargo.lock generated
View File

@@ -713,9 +713,9 @@ dependencies = [
 [[package]]
 name = "aws-sdk-s3"
-version = "1.104.0"
+version = "1.105.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "38c488cd6abb0ec9811c401894191932e941c5f91dc226043edacd0afa1634bc"
+checksum = "c99789e929b5e1d9a5aa3fa1d81317f3a789afc796141d11b0eaafd9d9f47e38"
 dependencies = [
  "aws-credential-types",
  "aws-runtime",
@@ -963,9 +963,9 @@ dependencies = [
 [[package]]
 name = "aws-smithy-runtime"
-version = "1.9.1"
+version = "1.9.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d3946acbe1ead1301ba6862e712c7903ca9bb230bdf1fbd1b5ac54158ef2ab1f"
+checksum = "4fa63ad37685ceb7762fa4d73d06f1d5493feb88e3f27259b9ed277f4c01b185"
 dependencies = [
  "aws-smithy-async",
  "aws-smithy-http",
@@ -1153,7 +1153,7 @@ dependencies = [
  "bitflags 2.9.4",
  "cexpr",
  "clang-sys",
- "itertools 0.12.1",
+ "itertools 0.11.0",
  "lazy_static",
  "lazycell",
  "log",
@@ -4628,7 +4628,7 @@ dependencies = [
 [[package]]
 name = "lancedb"
-version = "0.22.1-beta.0"
+version = "0.22.1-beta.1"
 dependencies = [
  "arrow",
  "arrow-array",
@@ -4715,7 +4715,7 @@ dependencies = [
 [[package]]
 name = "lancedb-nodejs"
-version = "0.22.1-beta.0"
+version = "0.22.1-beta.1"
 dependencies = [
  "arrow-array",
  "arrow-ipc",
@@ -4735,7 +4735,7 @@ dependencies = [
 [[package]]
 name = "lancedb-python"
-version = "0.25.1-beta.0"
+version = "0.25.1-beta.1"
 dependencies = [
  "arrow",
  "async-trait",
View File

@@ -16,6 +16,7 @@ pub trait JNIEnvExt {
     fn get_integers(&mut self, obj: &JObject) -> Result<Vec<i32>>;
     /// Get strings from Java List<String> object.
+    #[allow(dead_code)]
     fn get_strings(&mut self, obj: &JObject) -> Result<Vec<String>>;
     /// Get strings from Java String[] object.
View File

@@ -6,6 +6,7 @@ use jni::JNIEnv;
 use crate::Result;

+#[allow(dead_code)]
 pub trait FromJObject<T> {
     fn extract(&self) -> Result<T>;
 }
@@ -39,6 +40,7 @@ impl FromJObject<f64> for JObject<'_> {
     }
 }

+#[allow(dead_code)]
 pub trait FromJString {
     fn extract(&self, env: &mut JNIEnv) -> Result<String>;
 }
@@ -66,6 +68,7 @@ pub trait JMapExt {
     fn get_f64(&self, env: &mut JNIEnv, key: &str) -> Result<Option<f64>>;
 }

+#[allow(dead_code)]
 fn get_map_value<T>(env: &mut JNIEnv, map: &JMap, key: &str) -> Result<Option<T>>
 where
     for<'a> JObject<'a>: FromJObject<T>,

View File

@@ -8,7 +8,7 @@
   <parent>
     <groupId>com.lancedb</groupId>
     <artifactId>lancedb-parent</artifactId>
-    <version>0.22.1-beta.0</version>
+    <version>0.22.1-beta.1</version>
     <relativePath>../pom.xml</relativePath>
   </parent>

View File

@@ -8,7 +8,7 @@
   <parent>
     <groupId>com.lancedb</groupId>
     <artifactId>lancedb-parent</artifactId>
-    <version>0.22.1-beta.0</version>
+    <version>0.22.1-beta.1</version>
     <relativePath>../pom.xml</relativePath>
   </parent>

View File

@@ -6,7 +6,7 @@
   <groupId>com.lancedb</groupId>
   <artifactId>lancedb-parent</artifactId>
-  <version>0.22.1-beta.0</version>
+  <version>0.22.1-beta.1</version>
   <packaging>pom</packaging>
   <name>${project.artifactId}</name>
   <description>LanceDB Java SDK Parent POM</description>

View File

@@ -1,7 +1,7 @@
 [package]
 name = "lancedb-nodejs"
 edition.workspace = true
-version = "0.22.1-beta.0"
+version = "0.22.1-beta.1"
 license.workspace = true
 description.workspace = true
 repository.workspace = true

View File

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-darwin-arm64",
-  "version": "0.22.1-beta.0",
+  "version": "0.22.1-beta.1",
   "os": ["darwin"],
   "cpu": ["arm64"],
   "main": "lancedb.darwin-arm64.node",

View File

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-darwin-x64",
-  "version": "0.22.1-beta.0",
+  "version": "0.22.1-beta.1",
   "os": ["darwin"],
   "cpu": ["x64"],
   "main": "lancedb.darwin-x64.node",

View File

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-arm64-gnu",
-  "version": "0.22.1-beta.0",
+  "version": "0.22.1-beta.1",
   "os": ["linux"],
   "cpu": ["arm64"],
   "main": "lancedb.linux-arm64-gnu.node",

View File

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-arm64-musl",
-  "version": "0.22.1-beta.0",
+  "version": "0.22.1-beta.1",
   "os": ["linux"],
   "cpu": ["arm64"],
   "main": "lancedb.linux-arm64-musl.node",

View File

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-x64-gnu",
-  "version": "0.22.1-beta.0",
+  "version": "0.22.1-beta.1",
   "os": ["linux"],
   "cpu": ["x64"],
   "main": "lancedb.linux-x64-gnu.node",

View File

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-x64-musl",
-  "version": "0.22.1-beta.0",
+  "version": "0.22.1-beta.1",
   "os": ["linux"],
   "cpu": ["x64"],
   "main": "lancedb.linux-x64-musl.node",

View File

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-win32-arm64-msvc",
-  "version": "0.22.1-beta.0",
+  "version": "0.22.1-beta.1",
   "os": [
     "win32"
   ],

View File

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-win32-x64-msvc",
-  "version": "0.22.1-beta.0",
+  "version": "0.22.1-beta.1",
   "os": ["win32"],
   "cpu": ["x64"],
   "main": "lancedb.win32-x64-msvc.node",

View File

@@ -1,12 +1,12 @@
 {
   "name": "@lancedb/lancedb",
-  "version": "0.22.1-beta.0",
+  "version": "0.22.1-beta.1",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "@lancedb/lancedb",
-      "version": "0.22.1-beta.0",
+      "version": "0.22.1-beta.1",
       "cpu": [
         "x64",
         "arm64"

View File

@@ -11,7 +11,7 @@
"ann" "ann"
], ],
"private": false, "private": false,
"version": "0.22.1-beta.0", "version": "0.22.1-beta.1",
"main": "dist/index.js", "main": "dist/index.js",
"exports": { "exports": {
".": "./dist/index.js", ".": "./dist/index.js",

View File

@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.25.1-beta.1"
+current_version = "0.25.1-beta.2"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
     (?P<minor>0|[1-9]\\d*)\\.

View File

@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-python"
-version = "0.25.1-beta.1"
+version = "0.25.1-beta.2"
 edition.workspace = true
 description = "Python bindings for LanceDB"
 license.workspace = true

View File

@@ -122,7 +122,7 @@ class EmbeddingFunctionRegistry:
obj["vector_column"]: EmbeddingFunctionConfig( obj["vector_column"]: EmbeddingFunctionConfig(
vector_column=obj["vector_column"], vector_column=obj["vector_column"],
source_column=obj["source_column"], source_column=obj["source_column"],
function=self.get(obj["name"])(**obj["model"]), function=self.get(obj["name"]).create(**obj["model"]),
) )
for obj in raw_list for obj in raw_list
} }
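Note: `create` is the classmethod that resolves `$var:` placeholders (set via `registry.set_var`) before the function is instantiated; calling the class directly, as the old code did, stored the placeholder string verbatim. A minimal sketch of the flow this change enables, assuming the standard registry import path and using the built-in "openai" function name purely for illustration:

```python
from lancedb.embeddings import EmbeddingFunctionRegistry  # import path assumed

registry = EmbeddingFunctionRegistry.get_instance()
registry.set_var("my_api_key", "sk-real-key")

# `.create()` substitutes the variable's value at construction time, while
# safe_model_dump() keeps the "$var:..." form so the secret is never persisted.
func = registry.get("openai").create(api_key="$var:my_api_key")
```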

View File

@@ -251,6 +251,13 @@ class HnswPq:
         results. In most cases, there is no benefit to setting this higher than 500.
         This value should be set to a value that is not less than `ef` in the
         search phase.
+    target_partition_size, default is 1,048,576
+        The target size of each partition.
+        This value controls the tradeoff between search performance and accuracy:
+        the higher the value, the faster the search, but the less accurate the results.
     """

     distance_type: Literal["l2", "cosine", "dot"] = "l2"
@@ -261,6 +268,7 @@ class HnswPq:
     sample_rate: int = 256
     m: int = 20
     ef_construction: int = 300
+    target_partition_size: Optional[int] = None


 @dataclass
@@ -351,6 +359,12 @@ class HnswSq:
         This value should be set to a value that is not less than `ef` in the search
         phase.
+    target_partition_size, default is 1,048,576
+        The target size of each partition.
+        This value controls the tradeoff between search performance and accuracy:
+        the higher the value, the faster the search, but the less accurate the results.
     """

     distance_type: Literal["l2", "cosine", "dot"] = "l2"
@@ -359,6 +373,7 @@ class HnswSq:
     sample_rate: int = 256
     m: int = 20
     ef_construction: int = 300
+    target_partition_size: Optional[int] = None


 @dataclass
@@ -444,12 +459,20 @@ class IvfFlat:
         cases the default should be sufficient.
         The default value is 256.
+    target_partition_size, default is 8192
+        The target size of each partition.
+        This value controls the tradeoff between search performance and accuracy:
+        the higher the value, the faster the search, but the less accurate the results.
     """

     distance_type: Literal["l2", "cosine", "dot", "hamming"] = "l2"
     num_partitions: Optional[int] = None
     max_iterations: int = 50
     sample_rate: int = 256
+    target_partition_size: Optional[int] = None


 @dataclass
@@ -564,6 +587,13 @@ class IvfPq:
         cases the default should be sufficient.
         The default value is 256.
+    target_partition_size, default is 8192
+        The target size of each partition.
+        This value controls the tradeoff between search performance and accuracy:
+        the higher the value, the faster the search, but the less accurate the results.
     """

     distance_type: Literal["l2", "cosine", "dot"] = "l2"
@@ -572,6 +602,7 @@ class IvfPq:
     num_bits: int = 8
     max_iterations: int = 50
     sample_rate: int = 256
+    target_partition_size: Optional[int] = None


 __all__ = [
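Taken together, each IVF-family config dataclass gains an optional `target_partition_size`. A brief usage sketch (the `lancedb.index` import path is assumed; values mirror the documented defaults):

```python
from lancedb.index import HnswSq, IvfPq  # import path assumed

# Larger partitions mean fewer partitions to probe: faster search,
# less accurate results, per the docstrings above.
pq = IvfPq(distance_type="cosine", target_partition_size=8192)

# The HNSW variants default to a much larger target (1,048,576).
hnsw = HnswSq(distance_type="l2", target_partition_size=1_048_576)
```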

View File

@@ -691,6 +691,7 @@ class Table(ABC):
         ef_construction: int = 300,
         name: Optional[str] = None,
         train: bool = True,
+        target_partition_size: Optional[int] = None,
     ):
         """Create an index on the table.
@@ -2002,6 +2003,7 @@ class LanceTable(Table):
         *,
         name: Optional[str] = None,
         train: bool = True,
+        target_partition_size: Optional[int] = None,
     ):
         """Create an index on the table."""
         if accelerator is not None:
@@ -2018,6 +2020,7 @@ class LanceTable(Table):
                 num_bits=num_bits,
                 m=m,
                 ef_construction=ef_construction,
+                target_partition_size=target_partition_size,
             )
             self.checkout_latest()
             return
@@ -2027,6 +2030,7 @@ class LanceTable(Table):
                 num_partitions=num_partitions,
                 max_iterations=max_iterations,
                 sample_rate=sample_rate,
+                target_partition_size=target_partition_size,
             )
         elif index_type == "IVF_PQ":
             config = IvfPq(
@@ -2036,6 +2040,7 @@ class LanceTable(Table):
                 num_bits=num_bits,
                 max_iterations=max_iterations,
                 sample_rate=sample_rate,
+                target_partition_size=target_partition_size,
             )
         elif index_type == "IVF_HNSW_PQ":
             config = HnswPq(
@@ -2047,6 +2052,7 @@ class LanceTable(Table):
                 sample_rate=sample_rate,
                 m=m,
                 ef_construction=ef_construction,
+                target_partition_size=target_partition_size,
             )
         elif index_type == "IVF_HNSW_SQ":
             config = HnswSq(
@@ -2056,6 +2062,7 @@ class LanceTable(Table):
                 sample_rate=sample_rate,
                 m=m,
                 ef_construction=ef_construction,
+                target_partition_size=target_partition_size,
             )
         else:
             raise ValueError(f"Unknown index type {index_type}")
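On the synchronous `LanceTable` path above, the new keyword is simply forwarded into whichever config class matches `index_type`. A sketch, assuming `table` is an open `LanceTable` with a "vector" column:

```python
# target_partition_size is forwarded into HnswSq(...) for this index_type.
table.create_index(
    vector_column_name="vector",
    index_type="IVF_HNSW_SQ",
    m=20,
    ef_construction=300,
    target_partition_size=1_048_576,
)
```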

View File

@@ -114,6 +114,63 @@ def test_embedding_function_variables():
     assert func.safe_model_dump()["secret_key"] == "$var:secret"

+
+def test_parse_functions_with_variables():
+    @register("variable-parsing-test")
+    class VariableParsingFunction(TextEmbeddingFunction):
+        api_key: str
+        base_url: Optional[str] = None
+
+        @staticmethod
+        def sensitive_keys():
+            return ["api_key"]
+
+        def ndims(self):
+            return 10
+
+        def generate_embeddings(self, texts):
+            # Mock implementation that just returns random embeddings
+            # In real usage, this would use the api_key to call an API
+            return [np.random.rand(self.ndims()).tolist() for _ in texts]
+
+    registry = EmbeddingFunctionRegistry.get_instance()
+    registry.set_var("test_api_key", "sk-test-key-12345")
+    registry.set_var("test_base_url", "https://api.example.com")
+
+    conf = EmbeddingFunctionConfig(
+        source_column="text",
+        vector_column="vector",
+        function=registry.get("variable-parsing-test").create(
+            api_key="$var:test_api_key", base_url="$var:test_base_url"
+        ),
+    )
+    metadata = registry.get_table_metadata([conf])
+
+    # Create a mock arrow table with the metadata
+    schema = pa.schema(
+        [pa.field("text", pa.string()), pa.field("vector", pa.list_(pa.float32(), 10))]
+    )
+    table = pa.table({"text": [], "vector": []}, schema=schema)
+    table = table.replace_schema_metadata(metadata)
+    ds = lance.write_dataset(table, "memory://")
+
+    configs = registry.parse_functions(ds.schema.metadata)
+    assert "vector" in configs
+    parsed_func = configs["vector"].function
+    assert parsed_func.api_key == "sk-test-key-12345"
+    assert parsed_func.base_url == "https://api.example.com"
+
+    embeddings = parsed_func.generate_embeddings(["test text"])
+    assert len(embeddings) == 1
+    assert len(embeddings[0]) == 10
+
+    assert parsed_func.safe_model_dump()["api_key"] == "$var:test_api_key"
+
+
 def test_embedding_with_bad_results(tmp_path):
     @register("null-embedding")
     class NullEmbeddingFunction(TextEmbeddingFunction):

View File

@@ -674,6 +674,45 @@ def test_create_index_method(mock_create_index, mem_db: DBConnection):
"vector", replace=True, config=expected_config, name=None, train=True "vector", replace=True, config=expected_config, name=None, train=True
) )
# Test with target_partition_size
table.create_index(
metric="l2",
num_sub_vectors=96,
vector_column_name="vector",
replace=True,
index_cache_size=256,
num_bits=4,
target_partition_size=8192,
)
expected_config = IvfPq(
distance_type="l2",
num_sub_vectors=96,
num_bits=4,
target_partition_size=8192,
)
mock_create_index.assert_called_with(
"vector", replace=True, config=expected_config, name=None, train=True
)
# target_partition_size has a default value,
# so `num_partitions` and `target_partition_size` are not required
table.create_index(
metric="l2",
num_sub_vectors=96,
vector_column_name="vector",
replace=True,
index_cache_size=256,
num_bits=4,
)
expected_config = IvfPq(
distance_type="l2",
num_sub_vectors=96,
num_bits=4,
)
mock_create_index.assert_called_with(
"vector", replace=True, config=expected_config, name=None, train=True
)
table.create_index( table.create_index(
vector_column_name="my_vector", vector_column_name="my_vector",
metric="dot", metric="dot",

View File

@@ -255,7 +255,7 @@ impl Connection {
     #[pyo3(signature = (uri, api_key=None, region=None, host_override=None, read_consistency_interval=None, client_config=None, storage_options=None, session=None))]
     #[allow(clippy::too_many_arguments)]
     pub fn connect(
-        py: Python,
+        py: Python<'_>,
         uri: String,
         api_key: Option<String>,
         region: Option<String>,

View File

@@ -63,6 +63,9 @@ pub fn extract_index_params(source: &Option<Bound<'_, PyAny>>) -> PyResult<LanceDbIndex> {
             if let Some(num_partitions) = params.num_partitions {
                 ivf_flat_builder = ivf_flat_builder.num_partitions(num_partitions);
             }
+            if let Some(target_partition_size) = params.target_partition_size {
+                ivf_flat_builder = ivf_flat_builder.target_partition_size(target_partition_size);
+            }
             Ok(LanceDbIndex::IvfFlat(ivf_flat_builder))
         },
         "IvfPq" => {
@@ -76,6 +79,9 @@ pub fn extract_index_params(source: &Option<Bound<'_, PyAny>>) -> PyResult<LanceDbIndex> {
             if let Some(num_partitions) = params.num_partitions {
                 ivf_pq_builder = ivf_pq_builder.num_partitions(num_partitions);
             }
+            if let Some(target_partition_size) = params.target_partition_size {
+                ivf_pq_builder = ivf_pq_builder.target_partition_size(target_partition_size);
+            }
             if let Some(num_sub_vectors) = params.num_sub_vectors {
                 ivf_pq_builder = ivf_pq_builder.num_sub_vectors(num_sub_vectors);
             }
@@ -94,6 +100,9 @@ pub fn extract_index_params(source: &Option<Bound<'_, PyAny>>) -> PyResult<LanceDbIndex> {
             if let Some(num_partitions) = params.num_partitions {
                 hnsw_pq_builder = hnsw_pq_builder.num_partitions(num_partitions);
             }
+            if let Some(target_partition_size) = params.target_partition_size {
+                hnsw_pq_builder = hnsw_pq_builder.target_partition_size(target_partition_size);
+            }
             if let Some(num_sub_vectors) = params.num_sub_vectors {
                 hnsw_pq_builder = hnsw_pq_builder.num_sub_vectors(num_sub_vectors);
             }
@@ -111,6 +120,9 @@ pub fn extract_index_params(source: &Option<Bound<'_, PyAny>>) -> PyResult<LanceDbIndex> {
             if let Some(num_partitions) = params.num_partitions {
                 hnsw_sq_builder = hnsw_sq_builder.num_partitions(num_partitions);
             }
+            if let Some(target_partition_size) = params.target_partition_size {
+                hnsw_sq_builder = hnsw_sq_builder.target_partition_size(target_partition_size);
+            }
             Ok(LanceDbIndex::IvfHnswSq(hnsw_sq_builder))
         },
         not_supported => Err(PyValueError::new_err(format!(
@@ -144,6 +156,7 @@ struct IvfFlatParams {
     num_partitions: Option<u32>,
     max_iterations: u32,
     sample_rate: u32,
+    target_partition_size: Option<u32>,
 }

 #[derive(FromPyObject)]
@@ -154,6 +167,7 @@ struct IvfPqParams {
     num_bits: u32,
     max_iterations: u32,
     sample_rate: u32,
+    target_partition_size: Option<u32>,
 }

 #[derive(FromPyObject)]
@@ -166,6+180,7 @@ struct IvfHnswPqParams {
     sample_rate: u32,
     m: u32,
     ef_construction: u32,
+    target_partition_size: Option<u32>,
 }

 #[derive(FromPyObject)]
@@ -176,6 +191,7 @@ struct IvfHnswSqParams {
     sample_rate: u32,
     m: u32,
     ef_construction: u32,
+    target_partition_size: Option<u32>,
 }

 #[pyclass(get_all)]
#[pyclass(get_all)] #[pyclass(get_all)]

View File

@@ -1,6 +1,6 @@
 [package]
 name = "lancedb"
-version = "0.22.1-beta.0"
+version = "0.22.1-beta.1"
 edition.workspace = true
 description = "LanceDB: A serverless, low-latency vector database for AI applications"
 license.workspace = true
@@ -86,11 +86,11 @@ rand = { version = "0.9", features = ["small_rng"] }
 random_word = { version = "0.4.3", features = ["en"] }
 uuid = { version = "1.7.0", features = ["v4"] }
 walkdir = "2"
-aws-sdk-dynamodb = { version = "1.38.0" }
-aws-sdk-s3 = { version = "1.38.0" }
-aws-sdk-kms = { version = "1.37" }
-aws-config = { version = "1.0" }
-aws-smithy-runtime = { version = "1.3" }
+aws-sdk-dynamodb = { version = "1.55.0" }
+aws-sdk-s3 = { version = "1.55.0" }
+aws-sdk-kms = { version = "1.48.0" }
+aws-config = { version = "1.5.10" }
+aws-smithy-runtime = { version = "1.9.1" }
 datafusion.workspace = true
 http-body = "1" # Matching reqwest
 rstest = "0.23.0"

View File

@@ -1,86 +0,0 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors

//! Catalog implementation for managing databases

pub mod listing;

use std::collections::HashMap;
use std::sync::Arc;

use crate::database::Database;
use crate::error::Result;
use async_trait::async_trait;

pub trait CatalogOptions {
    fn serialize_into_map(&self, map: &mut HashMap<String, String>);
}

/// Request parameters for listing databases
#[derive(Clone, Debug, Default)]
pub struct DatabaseNamesRequest {
    /// Start listing after this name (exclusive)
    pub start_after: Option<String>,
    /// Maximum number of names to return
    pub limit: Option<u32>,
}

/// Request to open an existing database
#[derive(Clone, Debug)]
pub struct OpenDatabaseRequest {
    /// The name of the database to open
    pub name: String,
    /// A map of database-specific options
    ///
    /// Consult the catalog / database implementation to determine which options are available
    pub database_options: HashMap<String, String>,
}

/// Database creation mode
///
/// The default behavior is Create
pub enum CreateDatabaseMode {
    /// Create new database, error if exists
    Create,
    /// Open existing database if present
    ExistOk,
    /// Overwrite existing database
    Overwrite,
}

impl Default for CreateDatabaseMode {
    fn default() -> Self {
        Self::Create
    }
}

/// Request to create a new database
pub struct CreateDatabaseRequest {
    /// The name of the database to create
    pub name: String,
    /// The creation mode
    pub mode: CreateDatabaseMode,
    /// A map of catalog-specific options, consult your catalog implementation to determine what's available
    pub options: HashMap<String, String>,
}

#[async_trait]
pub trait Catalog: Send + Sync + std::fmt::Debug + 'static {
    /// List database names with pagination
    async fn database_names(&self, request: DatabaseNamesRequest) -> Result<Vec<String>>;
    /// Create a new database
    async fn create_database(&self, request: CreateDatabaseRequest) -> Result<Arc<dyn Database>>;
    /// Open existing database
    async fn open_database(&self, request: OpenDatabaseRequest) -> Result<Arc<dyn Database>>;
    /// Rename database
    async fn rename_database(&self, old_name: &str, new_name: &str) -> Result<()>;
    /// Delete database
    async fn drop_database(&self, name: &str) -> Result<()>;
    /// Delete all databases
    async fn drop_all_databases(&self) -> Result<()>;
}

View File

@@ -1,624 +0,0 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors

//! Catalog implementation based on a local file system.

use std::collections::HashMap;
use std::fs::create_dir_all;
use std::path::Path;
use std::sync::Arc;

use super::{
    Catalog, CatalogOptions, CreateDatabaseMode, CreateDatabaseRequest, DatabaseNamesRequest,
    OpenDatabaseRequest,
};
use crate::connection::ConnectRequest;
use crate::database::listing::{ListingDatabase, ListingDatabaseOptions};
use crate::database::{Database, DatabaseOptions};
use crate::error::{CreateDirSnafu, Error, Result};
use async_trait::async_trait;
use lance::io::{ObjectStore, ObjectStoreParams, ObjectStoreRegistry};
use lance_io::local::to_local_path;
use object_store::path::Path as ObjectStorePath;
use snafu::ResultExt;

/// Options for the listing catalog
///
/// Note: the catalog will use the `storage_options` configured on
/// db_options to configure storage for listing / creating / deleting
/// databases.
#[derive(Clone, Debug, Default)]
pub struct ListingCatalogOptions {
    /// The options to use for databases opened by this catalog
    ///
    /// This also contains the storage options used by the catalog
    pub db_options: ListingDatabaseOptions,
}

impl CatalogOptions for ListingCatalogOptions {
    fn serialize_into_map(&self, map: &mut HashMap<String, String>) {
        self.db_options.serialize_into_map(map);
    }
}

impl ListingCatalogOptions {
    pub fn builder() -> ListingCatalogOptionsBuilder {
        ListingCatalogOptionsBuilder::new()
    }

    pub(crate) fn parse_from_map(map: &HashMap<String, String>) -> Result<Self> {
        let db_options = ListingDatabaseOptions::parse_from_map(map)?;
        Ok(Self { db_options })
    }
}

#[derive(Clone, Debug, Default)]
pub struct ListingCatalogOptionsBuilder {
    options: ListingCatalogOptions,
}

impl ListingCatalogOptionsBuilder {
    pub fn new() -> Self {
        Self {
            options: ListingCatalogOptions::default(),
        }
    }

    pub fn db_options(mut self, db_options: ListingDatabaseOptions) -> Self {
        self.options.db_options = db_options;
        self
    }

    pub fn build(self) -> ListingCatalogOptions {
        self.options
    }
}

/// A catalog implementation that works by listing subfolders in a directory
///
/// The listing catalog will be created with a base folder specified by the URI. Every subfolder
/// in this base folder will be considered a database. These will be opened as a
/// [`crate::database::listing::ListingDatabase`]
#[derive(Debug)]
pub struct ListingCatalog {
    object_store: Arc<ObjectStore>,
    uri: String,
    base_path: ObjectStorePath,
    options: ListingCatalogOptions,
}

impl ListingCatalog {
    /// Try to create a local directory to store the lancedb dataset
    pub fn try_create_dir(path: &str) -> core::result::Result<(), std::io::Error> {
        let path = Path::new(path);
        if !path.try_exists()? {
            create_dir_all(path)?;
        }
        Ok(())
    }

    pub fn uri(&self) -> &str {
        &self.uri
    }

    async fn open_path(path: &str) -> Result<Self> {
        let (object_store, base_path) = ObjectStore::from_uri(path).await?;
        if object_store.is_local() {
            Self::try_create_dir(path).context(CreateDirSnafu { path })?;
        }
        Ok(Self {
            uri: path.to_string(),
            base_path,
            object_store,
            options: ListingCatalogOptions::default(),
        })
    }

    pub async fn connect(request: &ConnectRequest) -> Result<Self> {
        let uri = &request.uri;
        let parse_res = url::Url::parse(uri);
        let options = ListingCatalogOptions::parse_from_map(&request.options)?;
        match parse_res {
            Ok(url) if url.scheme().len() == 1 && cfg!(windows) => Self::open_path(uri).await,
            Ok(url) => {
                let plain_uri = url.to_string();
                let registry = Arc::new(ObjectStoreRegistry::default());
                let storage_options = options.db_options.storage_options.clone();
                let os_params = ObjectStoreParams {
                    storage_options: Some(storage_options.clone()),
                    ..Default::default()
                };
                let (object_store, base_path) =
                    ObjectStore::from_uri_and_params(registry, &plain_uri, &os_params).await?;
                if object_store.is_local() {
                    Self::try_create_dir(&plain_uri).context(CreateDirSnafu { path: plain_uri })?;
                }
                Ok(Self {
                    uri: String::from(url.clone()),
                    base_path,
                    object_store,
                    options,
                })
            }
            Err(_) => Self::open_path(uri).await,
        }
    }

    fn database_path(&self, name: &str) -> ObjectStorePath {
        self.base_path.child(name.replace('\\', "/"))
    }
}

#[async_trait]
impl Catalog for ListingCatalog {
    async fn database_names(&self, request: DatabaseNamesRequest) -> Result<Vec<String>> {
        let mut f = self
            .object_store
            .read_dir(self.base_path.clone())
            .await?
            .iter()
            .map(Path::new)
            .filter_map(|p| p.file_name().and_then(|s| s.to_str().map(String::from)))
            .collect::<Vec<String>>();
        f.sort();
        if let Some(start_after) = request.start_after {
            let index = f
                .iter()
                .position(|name| name.as_str() > start_after.as_str())
                .unwrap_or(f.len());
            f.drain(0..index);
        }
        if let Some(limit) = request.limit {
            f.truncate(limit as usize);
        }
        Ok(f)
    }

    async fn create_database(&self, request: CreateDatabaseRequest) -> Result<Arc<dyn Database>> {
        let db_path = self.database_path(&request.name);
        let db_path_str = to_local_path(&db_path);
        let exists = Path::new(&db_path_str).exists();
        match request.mode {
            CreateDatabaseMode::Create if exists => {
                return Err(Error::DatabaseAlreadyExists { name: request.name })
            }
            CreateDatabaseMode::Create => {
                create_dir_all(db_path.to_string()).unwrap();
            }
            CreateDatabaseMode::ExistOk => {
                if !exists {
                    create_dir_all(db_path.to_string()).unwrap();
                }
            }
            CreateDatabaseMode::Overwrite => {
                if exists {
                    self.drop_database(&request.name).await?;
                }
                create_dir_all(db_path.to_string()).unwrap();
            }
        }
        let db_uri = format!("/{}/{}", self.base_path, request.name);
        let mut connect_request = ConnectRequest {
            uri: db_uri,
            #[cfg(feature = "remote")]
            client_config: Default::default(),
            read_consistency_interval: None,
            options: Default::default(),
            session: None,
        };
        // Add the db options to the connect request
        self.options
            .db_options
            .serialize_into_map(&mut connect_request.options);
        Ok(Arc::new(
            ListingDatabase::connect_with_options(&connect_request).await?,
        ))
    }

    async fn open_database(&self, request: OpenDatabaseRequest) -> Result<Arc<dyn Database>> {
        let db_path = self.database_path(&request.name);
        let db_path_str = to_local_path(&db_path);
        let exists = Path::new(&db_path_str).exists();
        if !exists {
            return Err(Error::DatabaseNotFound { name: request.name });
        }
        let mut connect_request = ConnectRequest {
            uri: db_path.to_string(),
            #[cfg(feature = "remote")]
            client_config: Default::default(),
            read_consistency_interval: None,
            options: Default::default(),
            session: None,
        };
        // Add the db options to the connect request
        self.options
            .db_options
            .serialize_into_map(&mut connect_request.options);
        Ok(Arc::new(
            ListingDatabase::connect_with_options(&connect_request).await?,
        ))
    }

    async fn rename_database(&self, _old_name: &str, _new_name: &str) -> Result<()> {
        Err(Error::NotSupported {
            message: "rename_database is not supported in LanceDB OSS yet".to_string(),
        })
    }

    async fn drop_database(&self, name: &str) -> Result<()> {
        let db_path = self.database_path(name);
        self.object_store
            .remove_dir_all(db_path.clone())
            .await
            .map_err(|err| match err {
                lance::Error::NotFound { .. } => Error::DatabaseNotFound {
                    name: name.to_owned(),
                },
                _ => Error::from(err),
            })?;
        Ok(())
    }

    async fn drop_all_databases(&self) -> Result<()> {
        self.object_store
            .remove_dir_all(self.base_path.clone())
            .await?;
        Ok(())
    }
}

#[cfg(all(test, not(windows)))]
mod tests {
    use super::*;

    /// file:/// URIs with drive letters do not work correctly on Windows
    #[cfg(windows)]
    fn path_to_uri(path: PathBuf) -> String {
        path.to_str().unwrap().to_string()
    }

    #[cfg(not(windows))]
    fn path_to_uri(path: PathBuf) -> String {
        Url::from_file_path(path).unwrap().to_string()
    }

    async fn setup_catalog() -> (TempDir, ListingCatalog) {
        let tempdir = tempfile::tempdir().unwrap();
        let catalog_path = tempdir.path().join("catalog");
        std::fs::create_dir_all(&catalog_path).unwrap();
        let uri = path_to_uri(catalog_path);
        let request = ConnectRequest {
            uri: uri.clone(),
            #[cfg(feature = "remote")]
            client_config: Default::default(),
            options: Default::default(),
            read_consistency_interval: None,
            session: None,
        };
        let catalog = ListingCatalog::connect(&request).await.unwrap();
        (tempdir, catalog)
    }

    use crate::database::{CreateTableData, CreateTableRequest, TableNamesRequest};
    use crate::table::TableDefinition;
    use arrow_schema::Field;
    use std::path::PathBuf;
    use std::sync::Arc;
    use tempfile::{tempdir, TempDir};
    use url::Url;

    #[tokio::test]
    async fn test_database_names() {
        let (_tempdir, catalog) = setup_catalog().await;
        let names = catalog
            .database_names(DatabaseNamesRequest::default())
            .await
            .unwrap();
        assert!(names.is_empty());
    }

    #[tokio::test]
    async fn test_create_database() {
        let (_tempdir, catalog) = setup_catalog().await;
        catalog
            .create_database(CreateDatabaseRequest {
                name: "db1".into(),
                mode: CreateDatabaseMode::Create,
                options: HashMap::new(),
            })
            .await
            .unwrap();
        let names = catalog
            .database_names(DatabaseNamesRequest::default())
            .await
            .unwrap();
        assert_eq!(names, vec!["db1"]);
    }

    #[tokio::test]
    async fn test_create_database_exist_ok() {
        let (_tempdir, catalog) = setup_catalog().await;
        let db1 = catalog
            .create_database(CreateDatabaseRequest {
                name: "db_exist_ok".into(),
                mode: CreateDatabaseMode::ExistOk,
                options: HashMap::new(),
            })
            .await
            .unwrap();
        let dummy_schema = Arc::new(arrow_schema::Schema::new(Vec::<Field>::default()));
        db1.create_table(CreateTableRequest {
            name: "test_table".parse().unwrap(),
            data: CreateTableData::Empty(TableDefinition::new_from_schema(dummy_schema)),
            mode: Default::default(),
            write_options: Default::default(),
            namespace: vec![],
        })
        .await
        .unwrap();
        let db2 = catalog
            .create_database(CreateDatabaseRequest {
                name: "db_exist_ok".into(),
                mode: CreateDatabaseMode::ExistOk,
                options: HashMap::new(),
            })
            .await
            .unwrap();
        let tables = db2.table_names(TableNamesRequest::default()).await.unwrap();
        assert_eq!(tables, vec!["test_table".to_string()]);
    }

    #[tokio::test]
    async fn test_create_database_overwrite() {
        let (_tempdir, catalog) = setup_catalog().await;
        let db = catalog
            .create_database(CreateDatabaseRequest {
                name: "db_overwrite".into(),
                mode: CreateDatabaseMode::Create,
                options: HashMap::new(),
            })
            .await
            .unwrap();
        let dummy_schema = Arc::new(arrow_schema::Schema::new(Vec::<Field>::default()));
        db.create_table(CreateTableRequest {
            name: "old_table".parse().unwrap(),
            data: CreateTableData::Empty(TableDefinition::new_from_schema(dummy_schema)),
            mode: Default::default(),
            write_options: Default::default(),
            namespace: vec![],
        })
        .await
        .unwrap();
        let tables = db.table_names(TableNamesRequest::default()).await.unwrap();
        assert!(!tables.is_empty());
        let new_db = catalog
            .create_database(CreateDatabaseRequest {
                name: "db_overwrite".into(),
                mode: CreateDatabaseMode::Overwrite,
                options: HashMap::new(),
            })
            .await
            .unwrap();
        let tables = new_db
            .table_names(TableNamesRequest::default())
            .await
            .unwrap();
        assert!(tables.is_empty());
    }

    #[tokio::test]
    async fn test_create_database_overwrite_non_existing() {
        let (_tempdir, catalog) = setup_catalog().await;
        catalog
            .create_database(CreateDatabaseRequest {
                name: "new_db".into(),
                mode: CreateDatabaseMode::Overwrite,
                options: HashMap::new(),
            })
            .await
            .unwrap();
        let names = catalog
            .database_names(DatabaseNamesRequest::default())
            .await
            .unwrap();
        assert!(names.contains(&"new_db".to_string()));
    }

    #[tokio::test]
    async fn test_open_database() {
        let (_tempdir, catalog) = setup_catalog().await;
        // Test open non-existent
        let result = catalog
            .open_database(OpenDatabaseRequest {
                name: "missing".into(),
                database_options: HashMap::new(),
            })
            .await;
        assert!(matches!(
            result.unwrap_err(),
            Error::DatabaseNotFound { name } if name == "missing"
        ));
        // Create and open
        catalog
            .create_database(CreateDatabaseRequest {
                name: "valid_db".into(),
                mode: CreateDatabaseMode::Create,
                options: HashMap::new(),
            })
            .await
            .unwrap();
        let db = catalog
            .open_database(OpenDatabaseRequest {
                name: "valid_db".into(),
                database_options: HashMap::new(),
            })
            .await
            .unwrap();
        assert_eq!(
            db.table_names(TableNamesRequest::default()).await.unwrap(),
            Vec::<String>::new()
        );
    }

    #[tokio::test]
    async fn test_drop_database() {
        let (_tempdir, catalog) = setup_catalog().await;
        // Create test database
        catalog
            .create_database(CreateDatabaseRequest {
                name: "to_drop".into(),
                mode: CreateDatabaseMode::Create,
                options: HashMap::new(),
            })
            .await
            .unwrap();
        let names = catalog
            .database_names(DatabaseNamesRequest::default())
            .await
            .unwrap();
        assert!(!names.is_empty());
        // Drop database
        catalog.drop_database("to_drop").await.unwrap();
        let names = catalog
            .database_names(DatabaseNamesRequest::default())
            .await
            .unwrap();
        assert!(names.is_empty());
    }

    #[tokio::test]
    async fn test_drop_all_databases() {
        let (_tempdir, catalog) = setup_catalog().await;
        catalog
            .create_database(CreateDatabaseRequest {
                name: "db1".into(),
                mode: CreateDatabaseMode::Create,
                options: HashMap::new(),
            })
            .await
            .unwrap();
        catalog
            .create_database(CreateDatabaseRequest {
                name: "db2".into(),
                mode: CreateDatabaseMode::Create,
                options: HashMap::new(),
            })
            .await
            .unwrap();
        catalog.drop_all_databases().await.unwrap();
        let names = catalog
            .database_names(DatabaseNamesRequest::default())
            .await
            .unwrap();
        assert!(names.is_empty());
    }

    #[tokio::test]
    async fn test_rename_database_unsupported() {
        let (_tempdir, catalog) = setup_catalog().await;
        let result = catalog.rename_database("old", "new").await;
        assert!(matches!(
            result.unwrap_err(),
            Error::NotSupported { message } if message.contains("rename_database")
        ));
    }

    #[tokio::test]
    async fn test_connect_local_path() {
        let tmp_dir = tempdir().unwrap();
        let path = tmp_dir.path().to_str().unwrap();
        let request = ConnectRequest {
            uri: path.to_string(),
            #[cfg(feature = "remote")]
            client_config: Default::default(),
            options: Default::default(),
            read_consistency_interval: None,
            session: None,
        };
        let catalog = ListingCatalog::connect(&request).await.unwrap();
        assert!(catalog.object_store.is_local());
        assert_eq!(catalog.uri, path);
    }

    #[tokio::test]
    async fn test_connect_file_scheme() {
        let tmp_dir = tempdir().unwrap();
        let path = tmp_dir.path();
        let uri = path_to_uri(path.to_path_buf());
        let request = ConnectRequest {
            uri: uri.clone(),
            #[cfg(feature = "remote")]
            client_config: Default::default(),
            options: Default::default(),
            read_consistency_interval: None,
            session: None,
        };
        let catalog = ListingCatalog::connect(&request).await.unwrap();
        assert!(catalog.object_store.is_local());
        assert_eq!(catalog.uri, uri);
    }

    #[tokio::test]
    async fn test_connect_invalid_uri_fallback() {
        let invalid_uri = "invalid:///path";
        let request = ConnectRequest {
            uri: invalid_uri.to_string(),
            #[cfg(feature = "remote")]
            client_config: Default::default(),
            options: Default::default(),
            read_consistency_interval: None,
            session: None,
        };
        let result = ListingCatalog::connect(&request).await;
        assert!(result.is_err());
    }
}

View File

@@ -13,8 +13,6 @@ use lance::dataset::ReadParams;
 use object_store::aws::AwsCredential;

 use crate::arrow::{IntoArrow, IntoArrowStream, SendableRecordBatchStream};
-use crate::catalog::listing::ListingCatalog;
-use crate::catalog::CatalogOptions;
 use crate::database::listing::{
     ListingDatabase, OPT_NEW_TABLE_STORAGE_VERSION, OPT_NEW_TABLE_V2_MANIFEST_PATHS,
 };
@@ -660,7 +658,7 @@ pub struct ConnectRequest {
     #[cfg(feature = "remote")]
     pub client_config: ClientConfig,

-    /// Database/Catalog specific options
+    /// Database specific options
     pub options: HashMap<String, String>,

     /// The interval at which to check for updates from other processes.
@@ -937,50 +935,6 @@ pub fn connect(uri: &str) -> ConnectBuilder {
     ConnectBuilder::new(uri)
 }

-/// A builder for configuring a connection to a LanceDB catalog
-#[derive(Debug)]
-pub struct CatalogConnectBuilder {
-    request: ConnectRequest,
-}
-
-impl CatalogConnectBuilder {
-    /// Create a new [`CatalogConnectBuilder`] with the given catalog URI.
-    pub fn new(uri: &str) -> Self {
-        Self {
-            request: ConnectRequest {
-                uri: uri.to_string(),
-                #[cfg(feature = "remote")]
-                client_config: Default::default(),
-                read_consistency_interval: None,
-                options: HashMap::new(),
-                session: None,
-            },
-        }
-    }
-
-    pub fn catalog_options(mut self, catalog_options: &dyn CatalogOptions) -> Self {
-        catalog_options.serialize_into_map(&mut self.request.options);
-        self
-    }
-
-    /// Establishes a connection to the catalog
-    pub async fn execute(self) -> Result<Arc<ListingCatalog>> {
-        let catalog = ListingCatalog::connect(&self.request).await?;
-        Ok(Arc::new(catalog))
-    }
-}
-
-/// Connect to a LanceDB catalog.
-///
-/// A catalog is a container for databases, which in turn are containers for tables.
-///
-/// # Arguments
-///
-/// * `uri` - URI where the catalog is located, can be a local directory or supported remote cloud storage.
-pub fn connect_catalog(uri: &str) -> CatalogConnectBuilder {
-    CatalogConnectBuilder::new(uri)
-}
-
 #[cfg(all(test, feature = "remote"))]
 mod test_utils {
     use super::*;
@@ -1022,7 +976,6 @@ mod test_utils {
 mod tests {
     use std::fs::create_dir_all;

-    use crate::catalog::{Catalog, DatabaseNamesRequest, OpenDatabaseRequest};
     use crate::database::listing::{ListingDatabaseOptions, NewTableConfig};
     use crate::query::QueryBase;
     use crate::query::{ExecutableQuery, QueryExecutionOptions};
@@ -1328,91 +1281,4 @@ mod tests {
             .unwrap();
         assert_eq!(other_schema, overwritten.schema().await.unwrap());
     }
-
-    #[tokio::test]
-    async fn test_connect_catalog() {
-        let tmp_dir = tempdir().unwrap();
-        let uri = tmp_dir.path().to_str().unwrap();
-        let catalog = connect_catalog(uri).execute().await.unwrap();
-
-        // Verify that we can get the uri from the catalog
-        let catalog_uri = catalog.uri();
-        assert_eq!(catalog_uri, uri);
-
-        // Check that the catalog is initially empty
-        let dbs = catalog
-            .database_names(DatabaseNamesRequest::default())
-            .await
-            .unwrap();
-        assert_eq!(dbs.len(), 0);
-    }
-
-    #[tokio::test]
-    #[cfg(not(windows))]
-    async fn test_catalog_create_database() {
-        let tmp_dir = tempdir().unwrap();
-        let uri = tmp_dir.path().to_str().unwrap();
-        let catalog = connect_catalog(uri).execute().await.unwrap();
-
-        let db_name = "test_db";
-        catalog
-            .create_database(crate::catalog::CreateDatabaseRequest {
-                name: db_name.to_string(),
-                mode: Default::default(),
-                options: Default::default(),
-            })
-            .await
-            .unwrap();
-
-        let dbs = catalog
-            .database_names(DatabaseNamesRequest::default())
-            .await
-            .unwrap();
-        assert_eq!(dbs.len(), 1);
-        assert_eq!(dbs[0], db_name);
-
-        let db = catalog
-            .open_database(OpenDatabaseRequest {
-                name: db_name.to_string(),
-                database_options: HashMap::new(),
-            })
-            .await
-            .unwrap();
-        let tables = db.table_names(Default::default()).await.unwrap();
-        assert_eq!(tables.len(), 0);
-    }
-
-    #[tokio::test]
-    #[cfg(not(windows))]
-    async fn test_catalog_drop_database() {
-        let tmp_dir = tempdir().unwrap();
-        let uri = tmp_dir.path().to_str().unwrap();
-        let catalog = connect_catalog(uri).execute().await.unwrap();
-
-        // Create and then drop a database
-        let db_name = "test_db_to_drop";
-        catalog
-            .create_database(crate::catalog::CreateDatabaseRequest {
-                name: db_name.to_string(),
-                mode: Default::default(),
-                options: Default::default(),
-            })
-            .await
-            .unwrap();
-
-        let dbs = catalog
-            .database_names(DatabaseNamesRequest::default())
-            .await
-            .unwrap();
-        assert_eq!(dbs.len(), 1);
-
-        catalog.drop_database(db_name).await.unwrap();
-
-        let dbs_after = catalog
-            .database_names(DatabaseNamesRequest::default())
-            .await
-            .unwrap();
-        assert_eq!(dbs_after.len(), 0);
-    }
 }

View File

@@ -587,7 +587,13 @@ impl ListingDatabase {
 #[async_trait::async_trait]
 impl Database for ListingDatabase {
-    async fn list_namespaces(&self, _request: ListNamespacesRequest) -> Result<Vec<String>> {
+    async fn list_namespaces(&self, request: ListNamespacesRequest) -> Result<Vec<String>> {
+        if !request.namespace.is_empty() {
+            return Err(Error::NotSupported {
+                message: "Namespace operations are not supported for listing database".into(),
+            });
+        }
         Ok(Vec::new())
     }

View File

@@ -45,10 +45,10 @@ use crate::{
 pub trait EmbeddingFunction: std::fmt::Debug + Send + Sync {
     fn name(&self) -> &str;
     /// The type of the input data
-    fn source_type(&self) -> Result<Cow<DataType>>;
+    fn source_type(&self) -> Result<Cow<'_, DataType>>;
     /// The type of the output data
     /// This should **always** match the output of the `embed` function
-    fn dest_type(&self) -> Result<Cow<DataType>>;
+    fn dest_type(&self) -> Result<Cow<'_, DataType>>;
     /// Compute the embeddings for the source column in the database
     fn compute_source_embeddings(&self, source: Arc<dyn Array>) -> Result<Arc<dyn Array>>;
     /// Compute the embeddings for a given user query
/// Compute the embeddings for a given user query /// Compute the embeddings for a given user query

View File

@@ -75,11 +75,11 @@ impl EmbeddingFunction for BedrockEmbeddingFunction {
"bedrock" "bedrock"
} }
fn source_type(&self) -> Result<Cow<DataType>> { fn source_type(&self) -> Result<Cow<'_, DataType>> {
Ok(Cow::Owned(DataType::Utf8)) Ok(Cow::Owned(DataType::Utf8))
} }
fn dest_type(&self) -> Result<Cow<DataType>> { fn dest_type(&self) -> Result<Cow<'_, DataType>> {
let n_dims = self.model.ndims(); let n_dims = self.model.ndims();
Ok(Cow::Owned(DataType::new_fixed_size_list( Ok(Cow::Owned(DataType::new_fixed_size_list(
DataType::Float32, DataType::Float32,

View File

@@ -144,11 +144,11 @@ impl EmbeddingFunction for OpenAIEmbeddingFunction {
"openai" "openai"
} }
fn source_type(&self) -> Result<Cow<DataType>> { fn source_type(&self) -> Result<Cow<'_, DataType>> {
Ok(Cow::Owned(DataType::Utf8)) Ok(Cow::Owned(DataType::Utf8))
} }
fn dest_type(&self) -> Result<Cow<DataType>> { fn dest_type(&self) -> Result<Cow<'_, DataType>> {
let n_dims = self.model.ndims(); let n_dims = self.model.ndims();
Ok(Cow::Owned(DataType::new_fixed_size_list( Ok(Cow::Owned(DataType::new_fixed_size_list(
DataType::Float32, DataType::Float32,

View File

@@ -407,11 +407,11 @@ impl EmbeddingFunction for SentenceTransformersEmbeddings {
"sentence-transformers" "sentence-transformers"
} }
fn source_type(&self) -> crate::Result<std::borrow::Cow<arrow_schema::DataType>> { fn source_type(&self) -> crate::Result<std::borrow::Cow<'_, arrow_schema::DataType>> {
Ok(Cow::Owned(DataType::Utf8)) Ok(Cow::Owned(DataType::Utf8))
} }
fn dest_type(&self) -> crate::Result<std::borrow::Cow<arrow_schema::DataType>> { fn dest_type(&self) -> crate::Result<std::borrow::Cow<'_, arrow_schema::DataType>> {
let (n_dims, dtype) = self.compute_ndims_and_dtype()?; let (n_dims, dtype) = self.compute_ndims_and_dtype()?;
Ok(Cow::Owned(DataType::new_fixed_size_list( Ok(Cow::Owned(DataType::new_fixed_size_list(
dtype, dtype,

View File

@@ -112,6 +112,15 @@ macro_rules! impl_ivf_params_setter {
             self.max_iterations = max_iterations;
             self
         }

+        /// The target size of each partition.
+        ///
+        /// This value controls the tradeoff between search performance and accuracy.
+        /// The higher the value the faster the search but the less accurate the results will be.
+        pub fn target_partition_size(mut self, target_partition_size: u32) -> Self {
+            self.target_partition_size = Some(target_partition_size);
+            self
+        }
     };
 }
@@ -182,6 +191,7 @@ pub struct IvfFlatIndexBuilder {
     pub(crate) num_partitions: Option<u32>,
     pub(crate) sample_rate: u32,
     pub(crate) max_iterations: u32,
+    pub(crate) target_partition_size: Option<u32>,
 }

 impl Default for IvfFlatIndexBuilder {
@@ -191,6 +201,7 @@ impl Default for IvfFlatIndexBuilder {
             num_partitions: None,
             sample_rate: 256,
             max_iterations: 50,
+            target_partition_size: None,
         }
     }
 }
@@ -228,6 +239,7 @@ pub struct IvfPqIndexBuilder {
     pub(crate) num_partitions: Option<u32>,
     pub(crate) sample_rate: u32,
     pub(crate) max_iterations: u32,
+    pub(crate) target_partition_size: Option<u32>,

     // PQ
     pub(crate) num_sub_vectors: Option<u32>,
@@ -243,6 +255,7 @@ impl Default for IvfPqIndexBuilder {
             num_bits: None,
             sample_rate: 256,
             max_iterations: 50,
+            target_partition_size: None,
         }
     }
 }
@@ -293,6 +306,7 @@ pub struct IvfHnswPqIndexBuilder {
     pub(crate) num_partitions: Option<u32>,
     pub(crate) sample_rate: u32,
     pub(crate) max_iterations: u32,
+    pub(crate) target_partition_size: Option<u32>,

     // HNSW
     pub(crate) m: u32,
@@ -314,6 +328,7 @@ impl Default for IvfHnswPqIndexBuilder {
             max_iterations: 50,
             m: 20,
             ef_construction: 300,
+            target_partition_size: None,
         }
     }
 }
@@ -341,6 +356,7 @@ pub struct IvfHnswSqIndexBuilder {
     pub(crate) num_partitions: Option<u32>,
     pub(crate) sample_rate: u32,
     pub(crate) max_iterations: u32,
+    pub(crate) target_partition_size: Option<u32>,

     // HNSW
     pub(crate) m: u32,
@@ -358,6 +374,7 @@ impl Default for IvfHnswSqIndexBuilder {
             max_iterations: 50,
             m: 20,
             ef_construction: 300,
+            target_partition_size: None,
         }
     }
 }

View File

@@ -191,7 +191,6 @@
 //! ```

 pub mod arrow;
-pub mod catalog;
 pub mod connection;
 pub mod data;
 pub mod database;

View File

@@ -242,17 +242,15 @@ pub struct OptimizeStats {
 /// Describes what happens when a vector either contains NaN or
 /// does not have enough values
 #[derive(Clone, Debug, Default)]
+#[allow(dead_code)] // https://github.com/lancedb/lancedb/issues/992
 enum BadVectorHandling {
     /// An error is returned
     #[default]
     Error,
-    #[allow(dead_code)] // https://github.com/lancedb/lancedb/issues/992
     /// The offending row is dropped
     Drop,
-    #[allow(dead_code)] // https://github.com/lancedb/lancedb/issues/992
     /// The invalid/missing items are replaced by fill_value
     Fill(f32),
-    #[allow(dead_code)] // https://github.com/lancedb/lancedb/issues/992
     /// The invalid items are replaced by NULL
     None,
 }

View File

@@ -20,6 +20,8 @@ use datafusion_physical_plan::SendableRecordBatchStream;
 lazy_static! {
     static ref TABLE_NAME_REGEX: regex::Regex = regex::Regex::new(r"^[a-zA-Z0-9_\-\.]+$").unwrap();
+    static ref NAMESPACE_NAME_REGEX: regex::Regex =
+        regex::Regex::new(r"^[a-zA-Z0-9_\-\.]+$").unwrap();
 }

 pub trait PatchStoreParam {
@@ -98,6 +100,53 @@ pub fn validate_table_name(name: &str) -> Result<()> {
     Ok(())
 }

+/// Validate a namespace name component
+///
+/// Namespace names must:
+/// - Not be empty
+/// - Only contain alphanumeric characters, underscores, hyphens, and periods
+///
+/// # Arguments
+/// * `name` - A single namespace component (not the full path)
+///
+/// # Returns
+/// * `Ok(())` if the namespace name is valid
+/// * `Err(Error)` if the namespace name is invalid
+pub fn validate_namespace_name(name: &str) -> Result<()> {
+    if name.is_empty() {
+        return Err(Error::InvalidInput {
+            message: "Namespace names cannot be empty strings".to_string(),
+        });
+    }
+    if !NAMESPACE_NAME_REGEX.is_match(name) {
+        return Err(Error::InvalidInput {
+            message: format!(
+                "Invalid namespace name '{}': Namespace names can only contain alphanumeric characters, underscores, hyphens, and periods",
+                name
+            ),
+        });
+    }
+    Ok(())
+}
+
+/// Validate all components of a namespace
+///
+/// Iterates through all namespace components and validates each one.
+/// Returns an error if any component is invalid.
+///
+/// # Arguments
+/// * `namespace` - The namespace components to validate
+///
+/// # Returns
+/// * `Ok(())` if all namespace components are valid
+/// * `Err(Error)` if any component is invalid
+pub fn validate_namespace(namespace: &[String]) -> Result<()> {
+    for component in namespace {
+        validate_namespace_name(component)?;
+    }
+    Ok(())
+}
+
 /// Find one default column to create index or perform vector query.
 pub(crate) fn default_vector_column(schema: &Schema, dim: Option<i32>) -> Result<String> {
     // Try to find a vector column.
@@ -345,6 +394,61 @@ mod tests {
         assert!(validate_table_name("name with space").is_err());
     }

+    #[test]
+    fn test_validate_namespace_name() {
+        // Valid namespace names
+        assert!(validate_namespace_name("ns1").is_ok());
+        assert!(validate_namespace_name("namespace_123").is_ok());
+        assert!(validate_namespace_name("my-namespace").is_ok());
+        assert!(validate_namespace_name("my.namespace").is_ok());
+        assert!(validate_namespace_name("NS_1.2.3").is_ok());
+        assert!(validate_namespace_name("a").is_ok());
+        assert!(validate_namespace_name("123").is_ok());
+        assert!(validate_namespace_name("_underscore").is_ok());
+        assert!(validate_namespace_name("-hyphen").is_ok());
+        assert!(validate_namespace_name(".period").is_ok());
+
+        // Invalid namespace names
+        assert!(validate_namespace_name("").is_err());
+        assert!(validate_namespace_name("namespace with spaces").is_err());
+        assert!(validate_namespace_name("namespace/with/slashes").is_err());
+        assert!(validate_namespace_name("namespace\\with\\backslashes").is_err());
+        assert!(validate_namespace_name("namespace$with$delimiter").is_err());
+        assert!(validate_namespace_name("namespace@special").is_err());
+        assert!(validate_namespace_name("namespace#hash").is_err());
+    }
+
+    #[test]
+    fn test_validate_namespace() {
+        // Valid namespace with single component
+        assert!(validate_namespace(&["ns1".to_string()]).is_ok());
+
+        // Valid namespace with multiple components
+        assert!(
+            validate_namespace(&["ns1".to_string(), "ns2".to_string(), "ns3".to_string()]).is_ok()
+        );
+
+        // Empty namespace (root) is valid
+        assert!(validate_namespace(&[]).is_ok());
+
+        // Invalid: contains empty component
+        assert!(validate_namespace(&["ns1".to_string(), "".to_string()]).is_err());
+
+        // Invalid: contains component with spaces
+        assert!(validate_namespace(&["ns1".to_string(), "ns 2".to_string()]).is_err());
+
+        // Invalid: contains component with special characters
+        assert!(validate_namespace(&["ns1".to_string(), "ns@2".to_string()]).is_err());
+        assert!(validate_namespace(&["ns1".to_string(), "ns/2".to_string()]).is_err());
+        assert!(validate_namespace(&["ns1".to_string(), "ns$2".to_string()]).is_err());
+
+        // Valid: underscores, hyphens, and periods are allowed
+        assert!(
+            validate_namespace(&["ns_1".to_string(), "ns-2".to_string(), "ns.3".to_string()])
+                .is_ok()
+        );
+    }
+
     #[test]
     fn test_string_to_datatype() {
         let string = "int32";
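For readers following the validation logic outside Rust: the rule is just the table-name pattern applied per component. An equivalent Python sketch, purely illustrative and not part of the change:

```python
import re

# Same pattern as NAMESPACE_NAME_REGEX above; "+" already rejects the
# empty string, so one check covers both rules.
_NAMESPACE_NAME = re.compile(r"^[a-zA-Z0-9_\-\.]+$")

def validate_namespace(namespace: list[str]) -> None:
    # An empty list (the root namespace) is valid.
    for component in namespace:
        if not _NAMESPACE_NAME.fullmatch(component):
            raise ValueError(f"Invalid namespace name '{component}'")
```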

View File

@@ -341,10 +341,10 @@ impl EmbeddingFunction for MockEmbed {
     fn name(&self) -> &str {
         &self.name
     }

-    fn source_type(&self) -> Result<Cow<DataType>> {
+    fn source_type(&self) -> Result<Cow<'_, DataType>> {
         Ok(Cow::Borrowed(&self.source_type))
     }

-    fn dest_type(&self) -> Result<Cow<DataType>> {
+    fn dest_type(&self) -> Result<Cow<'_, DataType>> {
         Ok(Cow::Borrowed(&self.dest_type))
     }

     fn compute_source_embeddings(&self, source: Arc<dyn Array>) -> Result<Arc<dyn Array>> {