Compare commits

...

14 Commits

Author SHA1 Message Date
Lance Release
84a6693294 Bump version: 0.17.0-beta.0 → 0.17.0-beta.1 2024-11-29 18:16:02 +00:00
Ryan Green
6c2d4c10a4 feat: support remote options for remote lancedb connection (#1895)
* Support subset of storage options as remote options
* Send Azure storage account name via HTTP header
2024-11-29 14:08:13 -03:30
Ryan Green
d914722f79 Revert "feat: support remote options for remote lancedb connection. Send Azure storage account name via HTTP header."
This reverts commit a6e4034dba.
2024-11-29 11:06:18 -03:30
Ryan Green
a6e4034dba feat: support remote options for remote lancedb connection. Send Azure storage account name via HTTP header. 2024-11-29 11:05:04 -03:30
QianZhu
2616a50502 fix: test errors after setting default limit (#1891) 2024-11-26 16:03:16 -08:00
LuQQiu
7b5e9d824a fix: dynamodb external manifest drop table (#1866)
Second PR for https://github.com/lancedb/lancedb/issues/1812
2024-11-26 13:20:48 -08:00
QianZhu
3b173e7cb9 fix: default limit for remote nodejs client (#1886)
https://github.com/lancedb/lancedb/issues/1804
2024-11-26 11:01:25 -08:00
Mr. Doge
d496ab13a0 ci: linux: specify target triple for neon pack-build (vectordb) (#1889)
Fixes an issue where all `neon pack-build` packages were named
`vectordb-linux-x64-musl-*.tgz` even when cross-compiling.

Adds a second parameter:
`TARGET_TRIPLE=${2:-x86_64-unknown-linux-gnu}`
`npm run pack-build -- -t $TARGET_TRIPLE`
2024-11-26 10:57:17 -08:00
Will Jones
69d9beebc7 docs: improve style and introduction to Python API docs (#1885)
I found the signatures difficult to read and the parameter section not
very space efficient.
2024-11-26 09:17:35 -08:00
Bert
d32360b99d feat: support overwrite and exist_ok mode for remote create_table (#1883)
Support passing modes "overwrite" and "exist_ok" when creating a remote
table.
2024-11-26 11:38:36 -05:00
Will Jones
9fa08bfa93 ci: use correct runner for vectordb (#1881)
We already do this for `gnu` builds; we should also do it for `musl`
builds.
2024-11-25 16:17:10 -08:00
LuQQiu
d6d9cb7415 feat: bump lance to 0.20.0b3 (#1882)
Bump lance version.
Upstream change log:
https://github.com/lancedb/lance/releases/tag/v0.20.0-beta.3
2024-11-25 16:15:44 -08:00
Lance Release
990d93f553 Updating package-lock.json 2024-11-25 22:06:39 +00:00
Lance Release
0832cba3c6 Bump version: 0.13.1-beta.0 → 0.14.0-beta.0 2024-11-25 22:06:14 +00:00
35 changed files with 292 additions and 69 deletions

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.13.1-beta.0"
current_version = "0.14.0-beta.0"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -133,7 +133,7 @@ jobs:
free -h
- name: Build Linux Artifacts
run: |
bash ci/build_linux_artifacts.sh ${{ matrix.config.arch }}
bash ci/build_linux_artifacts.sh ${{ matrix.config.arch }} ${{ matrix.config.arch }}-unknown-linux-gnu
- name: Upload Linux Artifacts
uses: actions/upload-artifact@v4
with:
@@ -143,7 +143,7 @@ jobs:
node-linux-musl:
name: vectordb (${{ matrix.config.arch}}-unknown-linux-musl)
runs-on: ubuntu-latest
runs-on: ${{ matrix.config.runner }}
container: alpine:edge
# Only runs on tags that matches the make-release action
if: startsWith(github.ref, 'refs/tags/v')
@@ -152,7 +152,10 @@ jobs:
matrix:
config:
- arch: x86_64
runner: ubuntu-latest
- arch: aarch64
# For successful fat LTO builds, we need a large runner to avoid OOM errors.
runner: buildjet-16vcpu-ubuntu-2204-arm
steps:
- name: Checkout
uses: actions/checkout@v4
@@ -185,7 +188,7 @@ jobs:
- name: Build Linux Artifacts
run: |
source ./saved_env
bash ci/manylinux_node/build_vectordb.sh ${{ matrix.config.arch }}
bash ci/manylinux_node/build_vectordb.sh ${{ matrix.config.arch }} ${{ matrix.config.arch }}-unknown-linux-musl
- name: Upload Linux Artifacts
uses: actions/upload-artifact@v4
with:
@@ -246,7 +249,7 @@ jobs:
nodejs-linux-musl:
name: lancedb (${{ matrix.config.arch}}-unknown-linux-musl
runs-on: ubuntu-latest
runs-on: ${{ matrix.config.runner }}
container: alpine:edge
# Only runs on tags that matches the make-release action
if: startsWith(github.ref, 'refs/tags/v')
@@ -255,7 +258,10 @@ jobs:
matrix:
config:
- arch: x86_64
runner: ubuntu-latest
- arch: aarch64
# For successful fat LTO builds, we need a large runner to avoid OOM errors.
runner: buildjet-16vcpu-ubuntu-2204-arm
steps:
- name: Checkout
uses: actions/checkout@v4

View File

@@ -23,13 +23,14 @@ rust-version = "1.80.0" # TO
[workspace.dependencies]
lance = { "version" = "=0.20.0", "features" = [
"dynamodb",
], git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
lance-index = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
lance-linalg = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
lance-table = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
lance-testing = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
lance-datafusion = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
lance-encoding = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
], git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.3" }
lance-io = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.3" }
lance-index = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.3" }
lance-linalg = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.3" }
lance-table = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.3" }
lance-testing = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.3" }
lance-datafusion = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.3" }
lance-encoding = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.3" }
# Note that this one does not include pyarrow
arrow = { version = "52.2", optional = false }
arrow-array = "52.2"

View File

@@ -1,8 +1,9 @@
#!/bin/bash
set -e
ARCH=${1:-x86_64}
TARGET_TRIPLE=${2:-x86_64-unknown-linux-gnu}
# We pass down the current user so that when we later mount the local files
# into the container, the files are accessible by the current user.
pushd ci/manylinux_node
docker build \
@@ -18,4 +19,4 @@ docker run \
-v $(pwd):/io -w /io \
--memory-swap=-1 \
lancedb-node-manylinux \
bash ci/manylinux_node/build_vectordb.sh $ARCH
bash ci/manylinux_node/build_vectordb.sh $ARCH $TARGET_TRIPLE

View File

@@ -2,6 +2,7 @@
# Builds the node module for manylinux. Invoked by ci/build_linux_artifacts.sh.
set -e
ARCH=${1:-x86_64}
TARGET_TRIPLE=${2:-x86_64-unknown-linux-gnu}
if [ "$ARCH" = "x86_64" ]; then
export OPENSSL_LIB_DIR=/usr/local/lib64/
@@ -17,4 +18,4 @@ FILE=$HOME/.bashrc && test -f $FILE && source $FILE
cd node
npm ci
npm run build-release
npm run pack-build
npm run pack-build -t $TARGET_TRIPLE

View File

@@ -55,6 +55,9 @@ plugins:
show_signature_annotations: true
show_root_heading: true
members_order: source
docstring_section_style: list
signature_crossrefs: true
separate_signature: true
import:
# for cross references
- https://arrow.apache.org/docs/objects.inv

View File

@@ -1,6 +1,16 @@
# Python API Reference
This section contains the API reference for the OSS Python API.
This section contains the API reference for the Python API. There is a
synchronous and an asynchronous API client.
The general flow of using the API is:
1. Use [lancedb.connect][] or [lancedb.connect_async][] to connect to a database.
2. Use the returned [lancedb.DBConnection][] or [lancedb.AsyncConnection][] to
create or open tables.
3. Use the returned [lancedb.table.Table][] or [lancedb.AsyncTable][] to query
or modify tables.
## Installation

View File

@@ -8,7 +8,7 @@
<parent>
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.13.1-beta.0</version>
<version>0.14.0-beta.0</version>
<relativePath>../pom.xml</relativePath>
</parent>

View File

@@ -6,7 +6,7 @@
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.13.1-beta.0</version>
<version>0.14.0-beta.0</version>
<packaging>pom</packaging>
<name>LanceDB Parent</name>

node/package-lock.json (generated, 20 lines changed)
View File

@@ -1,12 +1,12 @@
{
"name": "vectordb",
"version": "0.13.1-beta.0",
"version": "0.14.0-beta.0",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "vectordb",
"version": "0.13.1-beta.0",
"version": "0.14.0-beta.0",
"cpu": [
"x64",
"arm64"
@@ -52,14 +52,14 @@
"uuid": "^9.0.0"
},
"optionalDependencies": {
"@lancedb/vectordb-darwin-arm64": "0.13.1-beta.0",
"@lancedb/vectordb-darwin-x64": "0.13.1-beta.0",
"@lancedb/vectordb-linux-arm64-gnu": "0.13.1-beta.0",
"@lancedb/vectordb-linux-arm64-musl": "0.13.1-beta.0",
"@lancedb/vectordb-linux-x64-gnu": "0.13.1-beta.0",
"@lancedb/vectordb-linux-x64-musl": "0.13.1-beta.0",
"@lancedb/vectordb-win32-arm64-msvc": "0.13.1-beta.0",
"@lancedb/vectordb-win32-x64-msvc": "0.13.1-beta.0"
"@lancedb/vectordb-darwin-arm64": "0.14.0-beta.0",
"@lancedb/vectordb-darwin-x64": "0.14.0-beta.0",
"@lancedb/vectordb-linux-arm64-gnu": "0.14.0-beta.0",
"@lancedb/vectordb-linux-arm64-musl": "0.14.0-beta.0",
"@lancedb/vectordb-linux-x64-gnu": "0.14.0-beta.0",
"@lancedb/vectordb-linux-x64-musl": "0.14.0-beta.0",
"@lancedb/vectordb-win32-arm64-msvc": "0.14.0-beta.0",
"@lancedb/vectordb-win32-x64-msvc": "0.14.0-beta.0"
},
"peerDependencies": {
"@apache-arrow/ts": "^14.0.2",

View File

@@ -1,6 +1,6 @@
{
"name": "vectordb",
"version": "0.13.1-beta.0",
"version": "0.14.0-beta.0",
"description": " Serverless, low-latency vector database for AI applications",
"main": "dist/index.js",
"types": "dist/index.d.ts",
@@ -91,13 +91,13 @@
}
},
"optionalDependencies": {
"@lancedb/vectordb-darwin-x64": "0.13.1-beta.0",
"@lancedb/vectordb-darwin-arm64": "0.13.1-beta.0",
"@lancedb/vectordb-linux-x64-gnu": "0.13.1-beta.0",
"@lancedb/vectordb-linux-arm64-gnu": "0.13.1-beta.0",
"@lancedb/vectordb-linux-x64-musl": "0.13.1-beta.0",
"@lancedb/vectordb-linux-arm64-musl": "0.13.1-beta.0",
"@lancedb/vectordb-win32-x64-msvc": "0.13.1-beta.0",
"@lancedb/vectordb-win32-arm64-msvc": "0.13.1-beta.0"
"@lancedb/vectordb-darwin-x64": "0.14.0-beta.0",
"@lancedb/vectordb-darwin-arm64": "0.14.0-beta.0",
"@lancedb/vectordb-linux-x64-gnu": "0.14.0-beta.0",
"@lancedb/vectordb-linux-arm64-gnu": "0.14.0-beta.0",
"@lancedb/vectordb-linux-x64-musl": "0.14.0-beta.0",
"@lancedb/vectordb-linux-arm64-musl": "0.14.0-beta.0",
"@lancedb/vectordb-win32-x64-msvc": "0.14.0-beta.0",
"@lancedb/vectordb-win32-arm64-msvc": "0.14.0-beta.0"
}
}

View File

@@ -1,7 +1,7 @@
[package]
name = "lancedb-nodejs"
edition.workspace = true
version = "0.13.1-beta.0"
version = "0.14.0-beta.0"
license.workspace = true
description.workspace = true
repository.workspace = true

View File

@@ -110,7 +110,10 @@ describe("given a connection", () => {
let table = await db.createTable("test", data, { useLegacyFormat: true });
const isV2 = async (table: Table) => {
const data = await table.query().toArrow({ maxBatchLength: 100000 });
const data = await table
.query()
.limit(10000)
.toArrow({ maxBatchLength: 100000 });
console.log(data.batches.length);
return data.batches.length < 5;
};

View File

@@ -585,11 +585,11 @@ describe("When creating an index", () => {
expect(fs.readdirSync(indexDir)).toHaveLength(1);
for await (const r of tbl.query().where("id > 1").select(["id"])) {
expect(r.numRows).toBe(298);
expect(r.numRows).toBe(10);
}
// should also work with 'filter' alias
for await (const r of tbl.query().filter("id > 1").select(["id"])) {
expect(r.numRows).toBe(298);
expect(r.numRows).toBe(10);
}
});

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-arm64",
"version": "0.13.1-beta.0",
"version": "0.14.0-beta.0",
"os": ["darwin"],
"cpu": ["arm64"],
"main": "lancedb.darwin-arm64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-x64",
"version": "0.13.1-beta.0",
"version": "0.14.0-beta.0",
"os": ["darwin"],
"cpu": ["x64"],
"main": "lancedb.darwin-x64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-gnu",
"version": "0.13.1-beta.0",
"version": "0.14.0-beta.0",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-musl",
"version": "0.13.1-beta.0",
"version": "0.14.0-beta.0",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-musl.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-gnu",
"version": "0.13.1-beta.0",
"version": "0.14.0-beta.0",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-musl",
"version": "0.13.1-beta.0",
"version": "0.14.0-beta.0",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-musl.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-arm64-msvc",
"version": "0.13.1-beta.0",
"version": "0.14.0-beta.0",
"os": [
"win32"
],

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-x64-msvc",
"version": "0.13.1-beta.0",
"version": "0.14.0-beta.0",
"os": ["win32"],
"cpu": ["x64"],
"main": "lancedb.win32-x64-msvc.node",

View File

@@ -10,7 +10,7 @@
"vector database",
"ann"
],
"version": "0.13.1-beta.0",
"version": "0.14.0-beta.0",
"main": "dist/index.js",
"exports": {
".": "./dist/index.js",

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.17.0-beta.0"
current_version = "0.17.0-beta.1"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-python"
version = "0.17.0-beta.0"
version = "0.17.0-beta.1"
edition.workspace = true
description = "Python bindings for LanceDB"
license.workspace = true

View File

@@ -3,7 +3,7 @@ name = "lancedb"
# version in Cargo.toml
dependencies = [
"deprecation",
"pylance==0.20.0b2",
"pylance==0.20.0b3",
"tqdm>=4.27.0",
"pydantic>=1.10",
"packaging",

View File

@@ -1502,10 +1502,11 @@ class AsyncQueryBase(object):
... print(plan)
>>> asyncio.run(doctest_example()) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
ProjectionExec: expr=[vector@0 as vector, _distance@2 as _distance]
FilterExec: _distance@2 IS NOT NULL
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
KNNVectorDistance: metric=l2
LanceScan: uri=..., projection=[vector], row_id=true, row_addr=false, ordered=false
GlobalLimitExec: skip=0, fetch=10
FilterExec: _distance@2 IS NOT NULL
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
KNNVectorDistance: metric=l2
LanceScan: uri=..., projection=[vector], row_id=true, row_addr=false, ordered=false
Parameters
----------

View File

@@ -599,7 +599,9 @@ async def test_create_in_v2_mode(tmp_path):
)
async def is_in_v2_mode(tbl):
batches = await tbl.query().to_batches(max_batch_length=1024 * 10)
batches = (
await tbl.query().limit(10 * 1024).to_batches(max_batch_length=1024 * 10)
)
num_batches = 0
async for batch in batches:
num_batches += 1

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-node"
version = "0.13.1-beta.0"
version = "0.14.0-beta.0"
description = "Serverless, low-latency vector database for AI applications"
license.workspace = true
edition.workspace = true

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb"
version = "0.13.1-beta.0"
version = "0.14.0-beta.0"
edition.workspace = true
description = "LanceDB: A serverless, low-latency vector database for AI applications"
license.workspace = true
@@ -27,6 +27,7 @@ half = { workspace = true }
lazy_static.workspace = true
lance = { workspace = true }
lance-datafusion.workspace = true
lance-io = { workspace = true }
lance-index = { workspace = true }
lance-table = { workspace = true }
lance-linalg = { workspace = true }

View File

@@ -38,6 +38,9 @@ use crate::table::{NativeTable, TableDefinition, WriteOptions};
use crate::utils::validate_table_name;
use crate::Table;
pub use lance_encoding::version::LanceFileVersion;
#[cfg(feature = "remote")]
use lance_io::object_store::StorageOptions;
use lance_table::io::commit::commit_handler_from_url;
pub const LANCE_FILE_EXTENSION: &str = "lance";
@@ -133,7 +136,7 @@ impl IntoArrow for NoData {
/// A builder for configuring a [`Connection::create_table`] operation
pub struct CreateTableBuilder<const HAS_DATA: bool, T: IntoArrow> {
parent: Arc<dyn ConnectionInternal>,
pub(crate) parent: Arc<dyn ConnectionInternal>,
pub(crate) name: String,
pub(crate) data: Option<T>,
pub(crate) mode: CreateTableMode,
@@ -341,7 +344,7 @@ pub struct OpenTableBuilder {
}
impl OpenTableBuilder {
fn new(parent: Arc<dyn ConnectionInternal>, name: String) -> Self {
pub(crate) fn new(parent: Arc<dyn ConnectionInternal>, name: String) -> Self {
Self {
parent,
name,
@@ -717,12 +720,14 @@ impl ConnectBuilder {
message: "An api_key is required when connecting to LanceDb Cloud".to_string(),
})?;
let storage_options = StorageOptions(self.storage_options.clone());
let internal = Arc::new(crate::remote::db::RemoteDatabase::try_new(
&self.uri,
&api_key,
&region,
self.host_override,
self.client_config,
storage_options.into(),
)?);
Ok(Connection {
internal,
@@ -855,7 +860,7 @@ impl Database {
let table_base_uri = if let Some(store) = engine {
static WARN_ONCE: std::sync::Once = std::sync::Once::new();
WARN_ONCE.call_once(|| {
log::warn!("Specifing engine is not a publicly supported feature in lancedb yet. THE API WILL CHANGE");
log::warn!("Specifying engine is not a publicly supported feature in lancedb yet. THE API WILL CHANGE");
});
let old_scheme = url.scheme().to_string();
let new_scheme = format!("{}+{}", old_scheme, store);
@@ -1036,6 +1041,7 @@ impl ConnectionInternal for Database {
};
let mut write_params = options.write_options.lance_write_params.unwrap_or_default();
if matches!(&options.mode, CreateTableMode::Overwrite) {
write_params.mode = WriteMode::Overwrite;
}
@@ -1122,7 +1128,7 @@ impl ConnectionInternal for Database {
let dir_name = format!("{}.{}", name, LANCE_EXTENSION);
let full_path = self.base_path.child(dir_name.clone());
self.object_store
.remove_dir_all(full_path)
.remove_dir_all(full_path.clone())
.await
.map_err(|err| match err {
// this error is not lance::Error::DatasetNotFound,
@@ -1132,6 +1138,19 @@ impl ConnectionInternal for Database {
},
_ => Error::from(err),
})?;
let object_store_params = ObjectStoreParams {
storage_options: Some(self.storage_options.clone()),
..Default::default()
};
let mut uri = self.uri.clone();
if let Some(query_string) = &self.query_string {
uri.push_str(&format!("?{}", query_string));
}
let commit_handler = commit_handler_from_url(&uri, &Some(object_store_params))
.await
.unwrap();
commit_handler.delete(&full_path).await.unwrap();
Ok(())
}
@@ -1169,6 +1188,7 @@ mod tests {
use lance_testing::datagen::{BatchGenerator, IncrementingInt32};
use tempfile::tempdir;
use crate::query::QueryBase;
use crate::query::{ExecutableQuery, QueryExecutionOptions};
use super::*;
@@ -1296,6 +1316,7 @@ mod tests {
// In v1 the row group size will trump max_batch_length
let batches = tbl
.query()
.limit(20000)
.execute_with_options(QueryExecutionOptions {
max_batch_length: 50000,
..Default::default()

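The drop_table hunk above now also cleans up the table's entries in an external commit store (the DynamoDB manifest store from #1866), not just the objects under the table's prefix. A minimal usage sketch, assuming Lance's `s3+ddb://...?ddbTableName=...` URI convention and the crate's async connect API; all names are placeholders:

```rust
use lancedb::connect;

// Sketch only: bucket, prefix, and DynamoDB table name are placeholders.
// With the change above, drop_table also deletes the external manifest
// entries for the table instead of leaving them behind.
async fn drop_with_external_manifest() -> lancedb::Result<()> {
    let db = connect("s3+ddb://my-bucket/my-db?ddbTableName=my-manifest-table")
        .execute()
        .await?;
    db.drop_table("my_table").await?;
    Ok(())
}
```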
View File

@@ -596,7 +596,7 @@ impl Query {
pub(crate) fn new(parent: Arc<dyn TableInternal>) -> Self {
Self {
parent,
limit: None,
limit: Some(DEFAULT_TOP_K),
offset: None,
filter: None,
full_text_search: None,

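With `limit` now defaulting to `Some(DEFAULT_TOP_K)` (10, going by the updated tests), a plain query no longer returns the whole table. A minimal sketch of overriding the default, assuming the public `QueryBase`/`ExecutableQuery` traits imported elsewhere in this diff:

```rust
use arrow_array::RecordBatch;
use futures::TryStreamExt;
use lancedb::query::{ExecutableQuery, QueryBase};

// Sketch only: `tbl` is an existing lancedb::Table. Without the explicit
// limit, this query would now return at most the default top-k rows.
async fn full_scan(tbl: &lancedb::Table) -> lancedb::Result<Vec<RecordBatch>> {
    let batches = tbl
        .query()
        .limit(1_000_000) // override the new default limit
        .execute()
        .await?
        .try_collect::<Vec<_>>()
        .await?;
    Ok(batches)
}
```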
View File

@@ -21,6 +21,7 @@ use reqwest::{
};
use crate::error::{Error, Result};
use crate::remote::db::RemoteOptions;
const REQUEST_ID_HEADER: &str = "x-request-id";
@@ -215,6 +216,7 @@ impl RestfulLanceDbClient<Sender> {
region: &str,
host_override: Option<String>,
client_config: ClientConfig,
options: &RemoteOptions,
) -> Result<Self> {
let parsed_url = url::Url::parse(db_url).map_err(|err| Error::InvalidInput {
message: format!("db_url is not a valid URL. '{db_url}'. Error: {err}"),
@@ -255,6 +257,7 @@ impl RestfulLanceDbClient<Sender> {
region,
db_name,
host_override.is_some(),
options,
)?)
.user_agent(client_config.user_agent)
.build()
@@ -262,6 +265,7 @@ impl RestfulLanceDbClient<Sender> {
message: "Failed to build HTTP client".into(),
source: Some(Box::new(err)),
})?;
let host = match host_override {
Some(host_override) => host_override,
None => format!("https://{}.{}.api.lancedb.com", db_name, region),
@@ -287,6 +291,7 @@ impl<S: HttpSend> RestfulLanceDbClient<S> {
region: &str,
db_name: &str,
has_host_override: bool,
options: &RemoteOptions,
) -> Result<HeaderMap> {
let mut headers = HeaderMap::new();
headers.insert(
@@ -313,6 +318,23 @@ impl<S: HttpSend> RestfulLanceDbClient<S> {
);
}
if let Some(v) = options.0.get("account_name") {
headers.insert(
"x-azure-storage-account-name",
HeaderValue::from_str(v).map_err(|_| Error::InvalidInput {
message: format!("non-ascii storage account name '{}' provided", db_name),
})?,
);
}
if let Some(v) = options.0.get("azure_storage_account_name") {
headers.insert(
"x-azure-storage-account-name",
HeaderValue::from_str(v).map_err(|_| Error::InvalidInput {
message: format!("non-ascii storage account name '{}' provided", db_name),
})?,
);
}
Ok(headers)
}

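In other words, either of the two supported option keys is forwarded as the same Azure header on every request. An illustration of the mapping with simplified types (not the crate's exact signatures):

```rust
use std::collections::HashMap;

// Illustration only: given the filtered remote options, pick the account
// name (either supported key) and pair it with the header it is sent as.
fn azure_account_header(opts: &HashMap<String, String>) -> Option<(&'static str, &str)> {
    opts.get("azure_storage_account_name")
        .or_else(|| opts.get("account_name"))
        .map(|v| ("x-azure-storage-account-name", v.as_str()))
}
```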
View File

@@ -12,18 +12,21 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use std::sync::Arc;
use arrow_array::RecordBatchReader;
use async_trait::async_trait;
use http::StatusCode;
use lance_io::object_store::StorageOptions;
use moka::future::Cache;
use reqwest::header::CONTENT_TYPE;
use serde::Deserialize;
use tokio::task::spawn_blocking;
use crate::connection::{
ConnectionInternal, CreateTableBuilder, NoData, OpenTableBuilder, TableNamesBuilder,
ConnectionInternal, CreateTableBuilder, CreateTableMode, NoData, OpenTableBuilder,
TableNamesBuilder,
};
use crate::embeddings::EmbeddingRegistry;
use crate::error::Result;
@@ -52,9 +55,16 @@ impl RemoteDatabase {
region: &str,
host_override: Option<String>,
client_config: ClientConfig,
options: RemoteOptions,
) -> Result<Self> {
let client =
RestfulLanceDbClient::try_new(uri, api_key, region, host_override, client_config)?;
let client = RestfulLanceDbClient::try_new(
uri,
api_key,
region,
host_override,
client_config,
&options,
)?;
let table_cache = Cache::builder()
.time_to_live(std::time::Duration::from_secs(300))
@@ -95,6 +105,16 @@ impl<S: HttpSend> std::fmt::Display for RemoteDatabase<S> {
}
}
impl From<&CreateTableMode> for &'static str {
fn from(val: &CreateTableMode) -> Self {
match val {
CreateTableMode::Create => "create",
CreateTableMode::Overwrite => "overwrite",
CreateTableMode::ExistOk(_) => "exist_ok",
}
}
}
#[async_trait]
impl<S: HttpSend> ConnectionInternal for RemoteDatabase<S> {
async fn table_names(&self, options: TableNamesBuilder) -> Result<Vec<String>> {
@@ -133,14 +153,40 @@ impl<S: HttpSend> ConnectionInternal for RemoteDatabase<S> {
let req = self
.client
.post(&format!("/v1/table/{}/create/", options.name))
.query(&[("mode", Into::<&str>::into(&options.mode))])
.body(data_buffer)
.header(CONTENT_TYPE, ARROW_STREAM_CONTENT_TYPE);
let (request_id, rsp) = self.client.send(req, false).await?;
if rsp.status() == StatusCode::BAD_REQUEST {
let body = rsp.text().await.err_to_http(request_id.clone())?;
if body.contains("already exists") {
return Err(crate::Error::TableAlreadyExists { name: options.name });
return match options.mode {
CreateTableMode::Create => {
Err(crate::Error::TableAlreadyExists { name: options.name })
}
CreateTableMode::ExistOk(callback) => {
let builder = OpenTableBuilder::new(options.parent, options.name);
let builder = (callback)(builder);
builder.execute().await
}
// This should not happen, as we explicitly set the mode to overwrite and the server
// shouldn't return an error if the table already exists.
//
// However if the server is an older version that doesn't support the mode parameter,
// then we'll get the 400 response.
CreateTableMode::Overwrite => Err(crate::Error::Http {
source: format!(
"unexpected response from server for create mode overwrite: {}",
body
)
.into(),
request_id,
status_code: Some(StatusCode::BAD_REQUEST),
}),
};
} else {
return Err(crate::Error::InvalidInput { message: body });
}
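For reference, a usage sketch of the new modes against a remote connection, mirroring the builder calls exercised in the tests below; the table name and data are placeholders:

```rust
use std::sync::Arc;

use arrow_array::{Int32Array, RecordBatch, RecordBatchIterator};
use arrow_schema::{DataType, Field, Schema};
use lancedb::connection::{Connection, CreateTableMode};

// Sketch only: overwrite my_table if it already exists, create it otherwise.
async fn recreate_table(conn: &Connection) -> lancedb::Result<lancedb::Table> {
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
    let batch = RecordBatch::try_new(
        schema.clone(),
        vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
    )
    .unwrap();
    let reader = RecordBatchIterator::new([Ok(batch)], schema);
    conn.create_table("my_table", reader)
        // or CreateTableMode::ExistOk(Box::new(|b| b)) to open the table on conflict
        .mode(CreateTableMode::Overwrite)
        .execute()
        .await
}
```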
@@ -206,6 +252,29 @@ impl<S: HttpSend> ConnectionInternal for RemoteDatabase<S> {
}
}
/// RemoteOptions contains a subset of StorageOptions that are compatible with Remote LanceDB connections
#[derive(Clone, Debug, Default)]
pub struct RemoteOptions(pub HashMap<String, String>);
impl RemoteOptions {
pub fn new(options: HashMap<String, String>) -> Self {
Self(options)
}
}
impl From<StorageOptions> for RemoteOptions {
fn from(options: StorageOptions) -> Self {
let supported_opts = vec!["account_name", "azure_storage_account_name"];
let mut filtered = HashMap::new();
for opt in supported_opts {
if let Some(v) = options.0.get(opt) {
filtered.insert(opt.to_string(), v.to_string());
}
}
RemoteOptions::new(filtered)
}
}
#[cfg(test)]
mod tests {
use std::sync::{Arc, OnceLock};
@@ -213,7 +282,9 @@ mod tests {
use arrow_array::{Int32Array, RecordBatch, RecordBatchIterator};
use arrow_schema::{DataType, Field, Schema};
use crate::connection::ConnectBuilder;
use crate::{
connection::CreateTableMode,
remote::{ARROW_STREAM_CONTENT_TYPE, JSON_CONTENT_TYPE},
Connection, Error,
};
@@ -382,6 +453,73 @@ mod tests {
);
}
#[tokio::test]
async fn test_create_table_modes() {
let test_cases = [
(None, "mode=create"),
(Some(CreateTableMode::Create), "mode=create"),
(Some(CreateTableMode::Overwrite), "mode=overwrite"),
(
Some(CreateTableMode::ExistOk(Box::new(|b| b))),
"mode=exist_ok",
),
];
for (mode, expected_query_string) in test_cases {
let conn = Connection::new_with_handler(move |request| {
assert_eq!(request.method(), &reqwest::Method::POST);
assert_eq!(request.url().path(), "/v1/table/table1/create/");
assert_eq!(request.url().query(), Some(expected_query_string));
http::Response::builder().status(200).body("").unwrap()
});
let data = RecordBatch::try_new(
Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])),
vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
)
.unwrap();
let reader = RecordBatchIterator::new([Ok(data.clone())], data.schema());
let mut builder = conn.create_table("table1", reader);
if let Some(mode) = mode {
builder = builder.mode(mode);
}
builder.execute().await.unwrap();
}
// check that the open table callback is called with exist_ok
let conn = Connection::new_with_handler(|request| match request.url().path() {
"/v1/table/table1/create/" => http::Response::builder()
.status(400)
.body("Table table1 already exists")
.unwrap(),
"/v1/table/table1/describe/" => http::Response::builder().status(200).body("").unwrap(),
_ => {
panic!("unexpected path: {:?}", request.url().path());
}
});
let data = RecordBatch::try_new(
Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])),
vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
)
.unwrap();
let called: Arc<OnceLock<bool>> = Arc::new(OnceLock::new());
let reader = RecordBatchIterator::new([Ok(data.clone())], data.schema());
let called_in_cb = called.clone();
conn.create_table("table1", reader)
.mode(CreateTableMode::ExistOk(Box::new(move |b| {
called_in_cb.clone().set(true).unwrap();
b
})))
.execute()
.await
.unwrap();
let called = *called.get().unwrap_or(&false);
assert!(called);
}
#[tokio::test]
async fn test_create_table_empty() {
let conn = Connection::new_with_handler(|request| {
@@ -436,4 +574,16 @@ mod tests {
});
conn.rename_table("table1", "table2").await.unwrap();
}
#[tokio::test]
async fn test_connect_remote_options() {
let db_uri = "db://my-container/my-prefix";
let _ = ConnectBuilder::new(db_uri)
.region("us-east-1")
.api_key("my-api-key")
.storage_options(vec![("azure_storage_account_name", "my-storage-account")])
.execute()
.await
.unwrap();
}
}

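Putting the pieces together, a usage sketch of connecting to LanceDB Cloud with the new remote options, using the same builder calls as `test_connect_remote_options` above (placeholder URI, key, region, and account name; assumes the crate's `remote` feature is enabled):

```rust
use lancedb::{connect, Connection};

// Sketch only: the supported subset of storage options (the Azure account
// name keys) is filtered into RemoteOptions and sent to the server as the
// x-azure-storage-account-name header on every request.
async fn connect_remote() -> lancedb::Result<Connection> {
    connect("db://my-container/my-prefix")
        .api_key("my-api-key")
        .region("us-east-1")
        .storage_options(vec![("azure_storage_account_name", "my-storage-account")])
        .execute()
        .await
}
```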
View File

@@ -1227,6 +1227,7 @@ mod tests {
"prefilter": true,
"distance_type": "l2",
"nprobes": 20,
"k": 10,
"ef": Option::<usize>::None,
"refine_factor": null,
"version": null,