Files
lancedb/python/python/tests/test_s3.py
Will Jones 1d23af213b feat: expose storage options in LanceDB (#1204)
Exposes `storage_options` in LanceDB. This is provided for Python async,
Node `lancedb`, and Node `vectordb` (and Rust of course). Python
synchronous is omitted because it's not compatible with the PyArrow
filesystems we use there currently. In the future, we will move the sync
API to wrap the async one, and then it will get support for
`storage_options`.

1. Fixes #1168
2. Closes #1165
3. Closes #1082
4. Closes #439
5. Closes #897
6. Closes #642
7. Closes #281
8. Closes #114
9. Closes #990
10. Deprecating `awsCredentials` and `awsRegion`. Users are encouraged
to use `storageOptions` instead.
2024-04-10 10:12:04 -07:00

159 lines
4.7 KiB
Python

# Copyright 2024 Lance Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import copy
import pytest
import pyarrow as pa
import lancedb
# These are all keys that are accepted by storage_options
CONFIG = {
"allow_http": "true",
"aws_access_key_id": "ACCESSKEY",
"aws_secret_access_key": "SECRETKEY",
"aws_endpoint": "http://localhost:4566",
"aws_region": "us-east-1",
}
def get_boto3_client(*args, **kwargs):
import boto3
return boto3.client(
*args,
region_name=CONFIG["aws_region"],
aws_access_key_id=CONFIG["aws_access_key_id"],
aws_secret_access_key=CONFIG["aws_secret_access_key"],
**kwargs,
)
@pytest.fixture(scope="module")
def s3_bucket():
s3 = get_boto3_client("s3", endpoint_url=CONFIG["aws_endpoint"])
bucket_name = "lance-integtest"
# if bucket exists, delete it
try:
delete_bucket(s3, bucket_name)
except s3.exceptions.NoSuchBucket:
pass
s3.create_bucket(Bucket=bucket_name)
yield bucket_name
delete_bucket(s3, bucket_name)
def delete_bucket(s3, bucket_name):
# Delete all objects first
for obj in s3.list_objects(Bucket=bucket_name).get("Contents", []):
s3.delete_object(Bucket=bucket_name, Key=obj["Key"])
s3.delete_bucket(Bucket=bucket_name)
@pytest.mark.s3_test
def test_s3_lifecycle(s3_bucket: str):
storage_options = copy.copy(CONFIG)
uri = f"s3://{s3_bucket}/test_lifecycle"
data = pa.table({"x": [1, 2, 3]})
async def test():
db = await lancedb.connect_async(uri, storage_options=storage_options)
table = await db.create_table("test", schema=data.schema)
assert await table.count_rows() == 0
table = await db.create_table("test", data, mode="overwrite")
assert await table.count_rows() == 3
await table.add(data, mode="append")
assert await table.count_rows() == 6
table = await db.open_table("test")
assert await table.count_rows() == 6
await db.drop_table("test")
await db.drop_database()
asyncio.run(test())
@pytest.fixture()
def kms_key():
kms = get_boto3_client("kms", endpoint_url=CONFIG["aws_endpoint"])
key_id = kms.create_key()["KeyMetadata"]["KeyId"]
yield key_id
kms.schedule_key_deletion(KeyId=key_id, PendingWindowInDays=7)
def validate_objects_encrypted(bucket: str, path: str, kms_key: str):
s3 = get_boto3_client("s3", endpoint_url=CONFIG["aws_endpoint"])
objects = s3.list_objects_v2(Bucket=bucket, Prefix=path)["Contents"]
for obj in objects:
info = s3.head_object(Bucket=bucket, Key=obj["Key"])
assert info["ServerSideEncryption"] == "aws:kms", (
"object %s not encrypted" % obj["Key"]
)
assert info["SSEKMSKeyId"].endswith(kms_key), (
"object %s not encrypted with correct key" % obj["Key"]
)
@pytest.mark.s3_test
def test_s3_sse(s3_bucket: str, kms_key: str):
storage_options = copy.copy(CONFIG)
uri = f"s3://{s3_bucket}/test_lifecycle"
data = pa.table({"x": [1, 2, 3]})
async def test():
# Create a table with SSE
db = await lancedb.connect_async(uri, storage_options=storage_options)
table = await db.create_table(
"table1",
schema=data.schema,
storage_options={
"aws_server_side_encryption": "aws:kms",
"aws_sse_kms_key_id": kms_key,
},
)
await table.add(data)
await table.update({"x": "1"})
path = "test_lifecycle/table1.lance"
validate_objects_encrypted(s3_bucket, path, kms_key)
# Test we can set encryption at connection level too.
db = await lancedb.connect_async(
uri,
storage_options=dict(
aws_server_side_encryption="aws:kms",
aws_sse_kms_key_id=kms_key,
**storage_options,
),
)
table = await db.create_table("table2", schema=data.schema)
await table.add(data)
await table.update({"x": "1"})
path = "test_lifecycle/table2.lance"
validate_objects_encrypted(s3_bucket, path, kms_key)
asyncio.run(test())