feat: expose storage options in LanceDB (#1204)

Exposes `storage_options` in LanceDB. This is provided for Python async,
Node `lancedb`, and Node `vectordb` (and Rust of course). Python
synchronous is omitted because it's not compatible with the PyArrow
filesystems we use there currently. In the future, we will move the sync
API to wrap the async one, and then it will get support for
`storage_options`.

1. Fixes #1168
2. Closes #1165
3. Closes #1082
4. Closes #439
5. Closes #897
6. Closes #642
7. Closes #281
8. Closes #114
9. Closes #990
10. Deprecating `awsCredentials` and `awsRegion`. Users are encouraged
to use `storageOptions` instead.
This commit is contained in:
Will Jones
2024-04-10 10:12:04 -07:00
committed by GitHub
parent 25dea4e859
commit 1d23af213b
31 changed files with 3128 additions and 262 deletions

View File

@@ -0,0 +1,219 @@
// Copyright 2024 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/* eslint-disable @typescript-eslint/naming-convention */
import { connect } from "../dist";
import {
CreateBucketCommand,
DeleteBucketCommand,
DeleteObjectCommand,
HeadObjectCommand,
ListObjectsV2Command,
S3Client,
} from "@aws-sdk/client-s3";
import {
CreateKeyCommand,
ScheduleKeyDeletionCommand,
KMSClient,
} from "@aws-sdk/client-kms";
// Skip these tests unless the S3_TEST environment variable is set
// (any non-empty value enables them; otherwise the suite is registered with describe.skip).
const maybeDescribe = process.env.S3_TEST ? describe : describe.skip;
// These are all keys that are accepted by storage_options.
// The same object is passed to `connect` as `storageOptions` and is read by the
// S3/KMS test clients below for region, credentials and endpoint.
// Every value is a string, including the boolean-like `allowHttp`.
// NOTE(review): the endpoint presumably points at a local S3/KMS emulator — confirm.
const CONFIG = {
allowHttp: "true",
awsAccessKeyId: "ACCESSKEY",
awsSecretAccessKey: "SECRETKEY",
awsEndpoint: "http://127.0.0.1:4566",
awsRegion: "us-east-1",
};
/**
 * Test helper that owns an S3 bucket on the endpoint described by `CONFIG`.
 *
 * Use {@link S3Bucket.create} to get a fresh, empty bucket and
 * {@link S3Bucket.delete} to remove it (and everything in it) afterwards.
 */
class S3Bucket {
  name: string;

  constructor(name: string) {
    this.name = name;
  }

  /** Build an S3 client using the region, credentials and endpoint in `CONFIG`. */
  static s3Client() {
    return new S3Client({
      region: CONFIG.awsRegion,
      credentials: {
        accessKeyId: CONFIG.awsAccessKeyId,
        secretAccessKey: CONFIG.awsSecretAccessKey,
      },
      endpoint: CONFIG.awsEndpoint,
    });
  }

  /**
   * List the keys of every object in `bucket`, optionally restricted to `prefix`.
   *
   * A single ListObjectsV2 call returns at most one page of results, so this
   * follows the continuation token until the listing is no longer truncated.
   */
  private static async listAllKeys(
    client: S3Client,
    bucket: string,
    prefix?: string,
  ): Promise<string[]> {
    const keys: string[] = [];
    let continuationToken: string | undefined;
    do {
      const page = await client.send(
        new ListObjectsV2Command({
          Bucket: bucket,
          Prefix: prefix,
          ContinuationToken: continuationToken,
        }),
      );
      for (const object of page.Contents ?? []) {
        if (object.Key !== undefined) {
          keys.push(object.Key);
        }
      }
      continuationToken = page.IsTruncated
        ? page.NextContinuationToken
        : undefined;
    } while (continuationToken !== undefined);
    return keys;
  }

  /**
   * Create an empty bucket called `name`, removing any existing bucket of
   * that name (and its contents) first.
   */
  public static async create(name: string): Promise<S3Bucket> {
    const client = this.s3Client();
    // Delete the bucket if it already exists
    try {
      await this.deleteBucket(client, name);
    } catch (e) {
      // It's fine if the bucket doesn't exist
    }
    await client.send(new CreateBucketCommand({ Bucket: name }));
    return new S3Bucket(name);
  }

  /** Delete this bucket together with all objects stored in it. */
  public async delete() {
    const client = S3Bucket.s3Client();
    await S3Bucket.deleteBucket(client, this.name);
  }

  /** Delete every object in bucket `name`, then the bucket itself. */
  static async deleteBucket(client: S3Client, name: string) {
    // Must delete all objects before we can delete the bucket
    const keys = await S3Bucket.listAllKeys(client, name);
    for (const key of keys) {
      await client.send(new DeleteObjectCommand({ Bucket: name, Key: key }));
    }
    await client.send(new DeleteBucketCommand({ Bucket: name }));
  }

  /**
   * Assert that every object under `path` is encrypted with SSE-KMS using
   * the key identified by `keyId`.
   *
   * Fails if no objects are found under `path`, so a wrong prefix cannot
   * make the check pass vacuously.
   */
  public async assertAllEncrypted(path: string, keyId: string) {
    const client = S3Bucket.s3Client();
    const keys = await S3Bucket.listAllKeys(client, this.name, path);
    expect(keys.length).toBeGreaterThan(0);
    for (const key of keys) {
      const metadata = await client.send(
        new HeadObjectCommand({ Bucket: this.name, Key: key }),
      );
      expect(metadata.ServerSideEncryption).toBe("aws:kms");
      // Substring match: the reported key id only has to contain `keyId`
      expect(metadata.SSEKMSKeyId).toContain(keyId);
    }
  }
}
/**
 * Test helper wrapping a KMS key created on the endpoint described by `CONFIG`.
 */
class KmsKey {
  keyId: string;

  constructor(keyId: string) {
    this.keyId = keyId;
  }

  /** Build a KMS client from the region, credentials and endpoint in `CONFIG`. */
  static kmsClient() {
    const credentials = {
      accessKeyId: CONFIG.awsAccessKeyId,
      secretAccessKey: CONFIG.awsSecretAccessKey,
    };
    return new KMSClient({
      region: CONFIG.awsRegion,
      credentials,
      endpoint: CONFIG.awsEndpoint,
    });
  }

  /**
   * Create a new KMS key and wrap its id.
   * Throws if the response does not carry a key id.
   */
  public static async create(): Promise<KmsKey> {
    const response = await this.kmsClient().send(new CreateKeyCommand({}));
    const createdId = response?.KeyMetadata?.KeyId;
    if (createdId) {
      return new KmsKey(createdId);
    }
    throw new Error("Failed to create KMS key");
  }

  /** Schedule this key for deletion. */
  public async delete() {
    const request = new ScheduleKeyDeletionCommand({ KeyId: this.keyId });
    await KmsKey.kmsClient().send(request);
  }
}
// Integration tests for `storageOptions`; they run only when S3_TEST is set
// (see `maybeDescribe`) and share one bucket and one KMS key across all cases.
maybeDescribe("storage_options", () => {
let bucket: S3Bucket;
let kmsKey: KmsKey;
// Create a fresh "lancedb" bucket and a KMS key once for the whole suite.
beforeAll(async () => {
bucket = await S3Bucket.create("lancedb");
kmsKey = await KmsKey.create();
});
// Clean up the key and the bucket (including its objects) after the suite.
afterAll(async () => {
await kmsKey.delete();
await bucket.delete();
});
// Credentials and endpoint come only from `storageOptions`; the test then walks
// through create -> list -> open -> add -> drop on the same table.
it("can be used to configure auth and endpoints", async () => {
const uri = `s3://${bucket.name}/test`;
const db = await connect(uri, { storageOptions: CONFIG });
let table = await db.createTable("test", [{ a: 1, b: 2 }]);
let rowCount = await table.countRows();
expect(rowCount).toBe(1);
let tableNames = await db.tableNames();
expect(tableNames).toEqual(["test"]);
// Re-open the table by name and check it sees the same data
table = await db.openTable("test");
rowCount = await table.countRows();
expect(rowCount).toBe(1);
await table.add([
{ a: 2, b: 3 },
{ a: 3, b: 4 },
]);
rowCount = await table.countRows();
expect(rowCount).toBe(3);
// Dropping the table leaves the database empty again
await db.dropTable("test");
tableNames = await db.tableNames();
expect(tableNames).toEqual([]);
});
it("can configure encryption at connection and table level", async () => {
const uri = `s3://${bucket.name}/test`;
let db = await connect(uri, { storageOptions: CONFIG });
// Encryption settings supplied at table level only; the connection carries
// just the auth/endpoint settings from CONFIG.
let table = await db.createTable("table1", [{ a: 1, b: 2 }], {
storageOptions: {
awsServerSideEncryption: "aws:kms",
awsSseKmsKeyId: kmsKey.keyId,
},
});
let rowCount = await table.countRows();
expect(rowCount).toBe(1);
// Add after create so objects from both operations are covered by the check
await table.add([{ a: 2, b: 3 }]);
await bucket.assertAllEncrypted("test/table1.lance", kmsKey.keyId);
// Now with encryption settings at connection level
db = await connect(uri, {
storageOptions: {
...CONFIG,
awsServerSideEncryption: "aws:kms",
awsSseKmsKeyId: kmsKey.keyId,
},
});
// No table-level options here: the table is expected to inherit the
// connection's encryption settings.
table = await db.createTable("table2", [{ a: 1, b: 2 }]);
rowCount = await table.countRows();
expect(rowCount).toBe(1);
await table.add([{ a: 2, b: 3 }]);
await bucket.assertAllEncrypted("test/table2.lance", kmsKey.keyId);
});
});

View File

@@ -13,10 +13,32 @@
// limitations under the License.
import { fromTableToBuffer, makeArrowTable, makeEmptyTable } from "./arrow";
import { Connection as LanceDbConnection } from "./native";
import { ConnectionOptions, Connection as LanceDbConnection } from "./native";
import { Table } from "./table";
import { Table as ArrowTable, Schema } from "apache-arrow";
/**
 * Connect to a LanceDB instance at the given URI.
 *
 * Accepted formats:
 *
 * - `/path/to/database` - local database
 * - `s3://bucket/path/to/database` or `gs://bucket/path/to/database` - database on cloud storage
 * - `db://host:port` - remote database (LanceDB cloud)
 * @param {string} uri - The uri of the database. If the database uri starts
 * with `db://` then it connects to a remote database.
 * @param {Partial<ConnectionOptions>} opts - Optional connection settings. The
 * keys of `opts.storageOptions` may be given in camelCase or snake_case; they
 * are converted to snake_case before being handed to the native layer. The
 * object passed in is not modified.
 * @returns {Promise<Connection>} A connection wrapping the native connection.
 * @see {@link ConnectionOptions} for more details on the URI format.
 */
export async function connect(
  uri: string,
  opts?: Partial<ConnectionOptions>,
): Promise<Connection> {
  // Build a fresh options object instead of assigning into `opts`, so the
  // caller's object is never mutated as a side effect of connecting.
  const nativeOpts: Partial<ConnectionOptions> = {
    ...opts,
    storageOptions: cleanseStorageOptions(opts?.storageOptions),
  };
  const nativeConn = await LanceDbConnection.new(uri, nativeOpts);
  return new Connection(nativeConn);
}
export interface CreateTableOptions {
/**
* The mode to use when creating the table.
@@ -33,6 +55,28 @@ export interface CreateTableOptions {
* then no error will be raised.
*/
existOk: boolean;
/**
* Configuration for object storage.
*
* Options already set on the connection will be inherited by the table,
* but can be overridden here.
*
* The available options are described at https://lancedb.github.io/lancedb/guides/storage/
*/
storageOptions?: Record<string, string>;
}
/**
 * Options accepted by `Connection.openTable`.
 */
export interface OpenTableOptions {
/**
* Configuration for object storage.
*
* Options already set on the connection will be inherited by the table,
* but can be overridden here.
*
* The available options are described at https://lancedb.github.io/lancedb/guides/storage/
*/
storageOptions?: Record<string, string>;
}
export interface TableNamesOptions {
@@ -109,8 +153,14 @@ export class Connection {
* Open a table in the database.
* @param {string} name - The name of the table
*/
async openTable(name: string): Promise<Table> {
const innerTable = await this.inner.openTable(name);
async openTable(
name: string,
options?: Partial<OpenTableOptions>,
): Promise<Table> {
const innerTable = await this.inner.openTable(
name,
cleanseStorageOptions(options?.storageOptions),
);
return new Table(innerTable);
}
@@ -139,7 +189,12 @@ export class Connection {
table = makeArrowTable(data);
}
const buf = await fromTableToBuffer(table);
const innerTable = await this.inner.createTable(name, buf, mode);
const innerTable = await this.inner.createTable(
name,
buf,
mode,
cleanseStorageOptions(options?.storageOptions),
);
return new Table(innerTable);
}
@@ -162,7 +217,12 @@ export class Connection {
const table = makeEmptyTable(schema);
const buf = await fromTableToBuffer(table);
const innerTable = await this.inner.createEmptyTable(name, buf, mode);
const innerTable = await this.inner.createEmptyTable(
name,
buf,
mode,
cleanseStorageOptions(options?.storageOptions),
);
return new Table(innerTable);
}
@@ -174,3 +234,43 @@ export class Connection {
return this.inner.dropTable(name);
}
}
/**
 * Normalise a storage-options map so that every key is snake case.
 *
 * Returns `undefined` when no options are given. Entries whose value is
 * `undefined` are left out of the result; the input object is not modified.
 */
function cleanseStorageOptions(
  options?: Record<string, string>,
): Record<string, string> | undefined {
  if (options === undefined) {
    return undefined;
  }
  const cleansed: Record<string, string> = {};
  Object.entries(options).forEach(([rawKey, rawValue]) => {
    if (rawValue === undefined) {
      return;
    }
    cleansed[camelToSnakeCase(rawKey)] = rawValue;
  });
  return cleansed;
}
/**
 * Convert a camelCase (or PascalCase) string to snake case.
 *
 * Strings that already look like snake case are returned unchanged: anything
 * containing an underscore, and anything that is entirely upper case
 * (e.g. an environment-variable style key).
 * @param {string} camel - The key to convert.
 * @returns {string} The snake case form of `camel`.
 */
function camelToSnakeCase(camel: string): string {
  if (camel.includes("_")) {
    // Assume if there is at least one underscore, it is already snake case
    return camel;
  }
  // Use the locale-independent `toUpperCase`: `toLocaleUpperCase` follows the
  // host locale (e.g. Turkish maps "i" to a dotted capital), which would make
  // an all-caps ASCII key compare unequal and be wrongly rewritten.
  if (camel.toUpperCase() === camel) {
    // Assume if the string is all uppercase, it is already snake case
    return camel;
  }
  let result = camel.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
  // A leading capital (PascalCase) would otherwise yield a leading underscore
  if (result.startsWith("_")) {
    result = result.slice(1);
  }
  return result;
}

View File

@@ -12,12 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
import { Connection } from "./connection";
import {
Connection as LanceDbConnection,
ConnectionOptions,
} from "./native.js";
export {
WriteOptions,
WriteMode,
@@ -32,6 +26,7 @@ export {
VectorColumnOptions,
} from "./arrow";
export {
connect,
Connection,
CreateTableOptions,
TableNamesOptions,
@@ -46,24 +41,3 @@ export {
export { Index, IndexOptions, IvfPqOptions } from "./indices";
export { Table, AddDataOptions, IndexConfig, UpdateOptions } from "./table";
export * as embedding from "./embedding";
/**
* Connect to a LanceDB instance at the given URI.
*
* Accepted formats:
*
* - `/path/to/database` - local database
* - `s3://bucket/path/to/database` or `gs://bucket/path/to/database` - database on cloud storage
* - `db://host:port` - remote database (LanceDB cloud)
* @param {string} uri - The uri of the database. If the database uri starts
* with `db://` then it connects to a remote database.
* @see {@link ConnectionOptions} for more details on the URI format.
*/
export async function connect(
uri: string,
opts?: Partial<ConnectionOptions>,
): Promise<Connection> {
opts = opts ?? {};
const nativeConn = await LanceDbConnection.new(uri, opts);
return new Connection(nativeConn);
}

1636
nodejs/package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -18,6 +18,8 @@
},
"license": "Apache 2.0",
"devDependencies": {
"@aws-sdk/client-s3": "^3.33.0",
"@aws-sdk/client-kms": "^3.33.0",
"@napi-rs/cli": "^2.18.0",
"@types/jest": "^29.1.2",
"@types/tmp": "^0.2.6",
@@ -63,6 +65,7 @@
"lint": "eslint lancedb && eslint __test__",
"prepublishOnly": "napi prepublish -t npm",
"test": "npm run build && jest --verbose",
"integration": "S3_TEST=1 npm run test",
"universal": "napi universal",
"version": "napi version"
},

View File

@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use napi::bindgen_prelude::*;
use napi_derive::*;
@@ -64,6 +66,11 @@ impl Connection {
builder =
builder.read_consistency_interval(std::time::Duration::from_secs_f64(interval));
}
if let Some(storage_options) = options.storage_options {
for (key, value) in storage_options {
builder = builder.storage_option(key, value);
}
}
Ok(Self::inner_new(
builder
.execute()
@@ -118,14 +125,18 @@ impl Connection {
name: String,
buf: Buffer,
mode: String,
storage_options: Option<HashMap<String, String>>,
) -> napi::Result<Table> {
let batches = ipc_file_to_batches(buf.to_vec())
.map_err(|e| napi::Error::from_reason(format!("Failed to read IPC file: {}", e)))?;
let mode = Self::parse_create_mode_str(&mode)?;
let tbl = self
.get_inner()?
.create_table(&name, batches)
.mode(mode)
let mut builder = self.get_inner()?.create_table(&name, batches).mode(mode);
if let Some(storage_options) = storage_options {
for (key, value) in storage_options {
builder = builder.storage_option(key, value);
}
}
let tbl = builder
.execute()
.await
.map_err(|e| napi::Error::from_reason(format!("{}", e)))?;
@@ -138,15 +149,22 @@ impl Connection {
name: String,
schema_buf: Buffer,
mode: String,
storage_options: Option<HashMap<String, String>>,
) -> napi::Result<Table> {
let schema = ipc_file_to_schema(schema_buf.to_vec()).map_err(|e| {
napi::Error::from_reason(format!("Failed to marshal schema from JS to Rust: {}", e))
})?;
let mode = Self::parse_create_mode_str(&mode)?;
let tbl = self
let mut builder = self
.get_inner()?
.create_empty_table(&name, schema)
.mode(mode)
.mode(mode);
if let Some(storage_options) = storage_options {
for (key, value) in storage_options {
builder = builder.storage_option(key, value);
}
}
let tbl = builder
.execute()
.await
.map_err(|e| napi::Error::from_reason(format!("{}", e)))?;
@@ -154,10 +172,18 @@ impl Connection {
}
#[napi]
pub async fn open_table(&self, name: String) -> napi::Result<Table> {
let tbl = self
.get_inner()?
.open_table(&name)
pub async fn open_table(
&self,
name: String,
storage_options: Option<HashMap<String, String>>,
) -> napi::Result<Table> {
let mut builder = self.get_inner()?.open_table(&name);
if let Some(storage_options) = storage_options {
for (key, value) in storage_options {
builder = builder.storage_option(key, value);
}
}
let tbl = builder
.execute()
.await
.map_err(|e| napi::Error::from_reason(format!("{}", e)))?;

View File

@@ -12,7 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use connection::Connection;
use std::collections::HashMap;
use napi_derive::*;
mod connection;
@@ -38,6 +39,10 @@ pub struct ConnectionOptions {
/// Note: this consistency only applies to read operations. Write operations are
/// always consistent.
pub read_consistency_interval: Option<f64>,
/// (For LanceDB OSS only): configuration for object storage.
///
/// The available options are described at https://lancedb.github.io/lancedb/guides/storage/
pub storage_options: Option<HashMap<String, String>>,
}
/// Write mode for writing a table.
@@ -54,7 +59,7 @@ pub struct WriteOptions {
pub mode: Option<WriteMode>,
}
#[napi]
pub async fn connect(uri: String, options: ConnectionOptions) -> napi::Result<Connection> {
Connection::new(uri, options).await
/// Options object exposed to JavaScript through `#[napi(object)]`.
///
/// NOTE(review): presumably the options used when opening a table; the visible
/// `open_table` binding takes `storage_options` directly — confirm where this
/// struct is consumed.
#[napi(object)]
pub struct OpenTableOptions {
/// Optional string key/value configuration for object storage.
pub storage_options: Option<HashMap<String, String>>,
}