feat: dynamodb commit store support (#1410)

This allows users to specify URIs like:

```
s3+ddb://my_bucket/path?ddbTableName=myCommitTable
```

and it will support concurrent writes in S3.

* [x] Add dynamodb integration tests
* [x] Add modifications to get it working in Python sync API
* [x] Add section in documentation describing how to configure it.

Closes #534

---------

Co-authored-by: universalmind303 <cory.grinstead@gmail.com>
This commit is contained in:
Will Jones
2024-06-28 09:30:36 -07:00
committed by GitHub
parent d6485f1215
commit 865ed99881
13 changed files with 1844 additions and 58 deletions

View File

@@ -14,6 +14,11 @@
/* eslint-disable @typescript-eslint/naming-convention */
import {
CreateTableCommand,
DeleteTableCommand,
DynamoDBClient,
} from "@aws-sdk/client-dynamodb";
import {
CreateKeyCommand,
KMSClient,
@@ -38,6 +43,7 @@ const CONFIG = {
awsAccessKeyId: "ACCESSKEY",
awsSecretAccessKey: "SECRETKEY",
awsEndpoint: "http://127.0.0.1:4566",
dynamodbEndpoint: "http://127.0.0.1:4566",
awsRegion: "us-east-1",
};
@@ -66,7 +72,6 @@ class S3Bucket {
} catch {
// It's fine if the bucket doesn't exist
}
// biome-ignore lint/style/useNamingConvention: we dont control s3's api
await client.send(new CreateBucketCommand({ Bucket: name }));
return new S3Bucket(name);
}
@@ -79,32 +84,27 @@ class S3Bucket {
/**
 * Empties the bucket `name` and then deletes it.
 *
 * S3 refuses to delete a non-empty bucket, so every object is removed first.
 * `ListObjectsV2` returns results one page at a time, so the listing is
 * repeated with the continuation token until it is no longer truncated;
 * otherwise objects beyond the first page would be left behind and the final
 * `DeleteBucket` call would fail.
 *
 * @param client - S3 client used for every request.
 * @param name - Name of the bucket to remove.
 */
static async deleteBucket(client: S3Client, name: string) {
  let continuationToken: string | undefined;
  do {
    const page = await client.send(
      new ListObjectsV2Command({
        Bucket: name,
        ContinuationToken: continuationToken,
      }),
    );
    // `Contents` is absent when the page holds no objects.
    for (const object of page.Contents ?? []) {
      await client.send(
        new DeleteObjectCommand({ Bucket: name, Key: object.Key }),
      );
    }
    continuationToken = page.IsTruncated
      ? page.NextContinuationToken
      : undefined;
  } while (continuationToken !== undefined);
  await client.send(new DeleteBucketCommand({ Bucket: name }));
}
public async assertAllEncrypted(path: string, keyId: string) {
const client = S3Bucket.s3Client();
const objects = await client.send(
// biome-ignore lint/style/useNamingConvention: we dont control s3's api
new ListObjectsV2Command({ Bucket: this.name, Prefix: path }),
);
if (objects.Contents) {
for (const object of objects.Contents) {
const metadata = await client.send(
// biome-ignore lint/style/useNamingConvention: we dont control s3's api
new HeadObjectCommand({ Bucket: this.name, Key: object.Key }),
);
expect(metadata.ServerSideEncryption).toBe("aws:kms");
@@ -143,7 +143,6 @@ class KmsKey {
/** Schedules this KMS key (identified by `keyId`) for deletion. */
public async delete() {
  const kms = KmsKey.kmsClient();
  const scheduleDeletion = new ScheduleKeyDeletionCommand({
    KeyId: this.keyId,
  });
  await kms.send(scheduleDeletion);
}
}
@@ -224,3 +223,91 @@ maybeDescribe("storage_options", () => {
await bucket.assertAllEncrypted("test/table2.lance", kmsKey.keyId);
});
});
/**
 * Test helper that owns a DynamoDB table used as the commit table in the
 * `s3+ddb://` integration tests.
 */
class DynamoDBCommitTable {
  /** Name of the DynamoDB table this helper refers to. */
  name: string;

  constructor(name: string) {
    this.name = name;
  }

  /**
   * Builds a DynamoDB client from the shared test `CONFIG`.
   *
   * The client targets `CONFIG.dynamodbEndpoint`, the DynamoDB-specific
   * endpoint setting, rather than the generic AWS endpoint, so this helper
   * keeps pointing at DynamoDB if the two settings ever differ.
   */
  static dynamoClient() {
    return new DynamoDBClient({
      region: CONFIG.awsRegion,
      credentials: {
        accessKeyId: CONFIG.awsAccessKeyId,
        secretAccessKey: CONFIG.awsSecretAccessKey,
      },
      endpoint: CONFIG.dynamodbEndpoint,
    });
  }

  /**
   * Creates a table called `name` keyed by `base_uri` (string, hash key) and
   * `version` (number, range key), with one provisioned read and one
   * provisioned write capacity unit.
   *
   * @param name - Name of the table to create.
   * @returns A handle that can later be used to delete the table.
   */
  public static async create(name: string): Promise<DynamoDBCommitTable> {
    const client = DynamoDBCommitTable.dynamoClient();
    const command = new CreateTableCommand({
      TableName: name,
      AttributeDefinitions: [
        {
          AttributeName: "base_uri",
          AttributeType: "S",
        },
        {
          AttributeName: "version",
          AttributeType: "N",
        },
      ],
      KeySchema: [
        { AttributeName: "base_uri", KeyType: "HASH" },
        { AttributeName: "version", KeyType: "RANGE" },
      ],
      ProvisionedThroughput: {
        ReadCapacityUnits: 1,
        WriteCapacityUnits: 1,
      },
    });
    await client.send(command);
    return new DynamoDBCommitTable(name);
  }

  /** Deletes the DynamoDB table named by this handle. */
  public async delete() {
    const client = DynamoDBCommitTable.dynamoClient();
    await client.send(new DeleteTableCommand({ TableName: this.name }));
  }
}
// Integration suite for the `s3+ddb://` scheme: an S3 bucket holds the data
// and a DynamoDB table is named through the `ddbTableName` query parameter.
maybeDescribe("DynamoDB Lock", () => {
  let bucket: S3Bucket;
  let commitTable: DynamoDBCommitTable;

  beforeAll(async () => {
    bucket = await S3Bucket.create("lancedb2");
    commitTable = await DynamoDBCommitTable.create("commitTable");
  });

  afterAll(async () => {
    await commitTable.delete();
    await bucket.delete();
  });

  it("can be used to configure a DynamoDB table for commit log", async () => {
    const uri = `s3+ddb://${bucket.name}/test?ddbTableName=${commitTable.name}`;
    const database = await connect(uri, {
      storageOptions: CONFIG,
      readConsistencyInterval: 0,
    });
    const seededTable = await database.createTable("test", [{ a: 1, b: 2 }]);

    // Every append goes through its own freshly opened handle. Handles that
    // share a table reference also share its internal ReadWriteLock, which
    // would serialize the writes and defeat the point of the test.
    const appendThroughFreshHandle = async () => {
      const writer = await database.openTable("test");
      await writer.add([{ a: 2, b: 3 }]);
    };

    // Start 5 appends at once and wait for all of them.
    const pendingAppends = Array.from({ length: 5 }, () =>
      appendThroughFreshHandle(),
    );
    await Promise.all(pendingAppends);

    // One seed row plus one row from each of the 5 appends.
    expect(await seededTable.countRows()).toBe(6);
  });
});

View File

@@ -1,5 +1,5 @@
{
"$schema": "https://biomejs.dev/schemas/1.7.3/schema.json",
"$schema": "https://biomejs.dev/schemas/1.8.3/schema.json",
"organizeImports": {
"enabled": true
},
@@ -100,6 +100,16 @@
"globals": []
},
"overrides": [
{
"include": ["__test__/s3_integration.test.ts"],
"linter": {
"rules": {
"style": {
"useNamingConvention": "off"
}
}
}
},
{
"include": [
"**/*.ts",

View File

@@ -55,7 +55,7 @@ export class RestfulLanceDBClient {
return axios.create({
baseURL: this.url,
headers: {
// biome-ignore lint/style/useNamingConvention: external api
// biome-ignore lint: external API
Authorization: `Bearer ${this.#apiKey}`,
},
transformResponse: decodeErrorData,

1391
nodejs/package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -34,6 +34,7 @@
"devDependencies": {
"@aws-sdk/client-kms": "^3.33.0",
"@aws-sdk/client-s3": "^3.33.0",
"@aws-sdk/client-dynamodb": "^3.33.0",
"@biomejs/biome": "^1.7.3",
"@jest/globals": "^29.7.0",
"@napi-rs/cli": "^2.18.0",
@@ -68,7 +69,7 @@
"lint-ci": "biome ci .",
"docs": "typedoc --plugin typedoc-plugin-markdown --out ../docs/src/js lancedb/index.ts",
"lint": "biome check . && biome format .",
"lint-fix": "biome check --apply-unsafe . && biome format --write .",
"lint-fix": "biome check --write . && biome format --write .",
"prepublishOnly": "napi prepublish -t npm",
"test": "jest --verbose",
"integration": "S3_TEST=1 npm run test",