mirror of
https://github.com/lancedb/lancedb.git
synced 2026-01-05 19:32:56 +00:00
fix(python): require explicit region for S3 buckets with dots (#2892)
When region is not specific in the s3 path, `resolve_s3_region` from
"lance-format" project (see [here][1]) will resolve the region by
calling `resolve_bucket_region`, which is a function from the
"arrow-rs-object-store" project expecting [virtual-hosted-style
URLs][1]. When there are dot (".") in the virtual-hosted-style URLs, it
breaks automatic region detection. See more details in the issue
description:
https://github.com/lancedb/lancedb/issues/1898#issuecomment-3690142427
This PR add early validation in connect() and connect_async() to raise a
clear error with instructions when the region is not specified for such
buckets.
[1]:
https://github.com/lance-format/lance/blob/v2.0.0-beta.4/rust/lance-io/src/object_store/providers/aws.rs#L197
[2]:
eedbf3d7d8/src/aws/resolve.rs (L52C5-L52C65)
[3]:
https://docs.aws.amazon.com/AmazonS3/latest/userguide/VirtualHosting.html#virtual-hosted-style-access
Fixes #1898
This commit is contained in:
@@ -13,6 +13,7 @@ __version__ = importlib.metadata.version("lancedb")
|
||||
|
||||
from ._lancedb import connect as lancedb_connect
|
||||
from .common import URI, sanitize_uri
|
||||
from urllib.parse import urlparse
|
||||
from .db import AsyncConnection, DBConnection, LanceDBConnection
|
||||
from .io import StorageOptionsProvider
|
||||
from .remote import ClientConfig
|
||||
@@ -28,6 +29,39 @@ from .namespace import (
|
||||
)
|
||||
|
||||
|
||||
def _check_s3_bucket_with_dots(
|
||||
uri: str, storage_options: Optional[Dict[str, str]]
|
||||
) -> None:
|
||||
"""
|
||||
Check if an S3 URI has a bucket name containing dots and warn if no region
|
||||
is specified. S3 buckets with dots cannot use virtual-hosted-style URLs,
|
||||
which breaks automatic region detection.
|
||||
|
||||
See: https://github.com/lancedb/lancedb/issues/1898
|
||||
"""
|
||||
if not isinstance(uri, str) or not uri.startswith("s3://"):
|
||||
return
|
||||
|
||||
parsed = urlparse(uri)
|
||||
bucket = parsed.netloc
|
||||
|
||||
if "." not in bucket:
|
||||
return
|
||||
|
||||
# Check if region is provided in storage_options
|
||||
region_keys = {"region", "aws_region"}
|
||||
has_region = storage_options and any(k in storage_options for k in region_keys)
|
||||
|
||||
if not has_region:
|
||||
raise ValueError(
|
||||
f"S3 bucket name '{bucket}' contains dots, which prevents automatic "
|
||||
f"region detection. Please specify the region explicitly via "
|
||||
f"storage_options={{'region': '<your-region>'}} or "
|
||||
f"storage_options={{'aws_region': '<your-region>'}}. "
|
||||
f"See https://github.com/lancedb/lancedb/issues/1898 for details."
|
||||
)
|
||||
|
||||
|
||||
def connect(
|
||||
uri: URI,
|
||||
*,
|
||||
@@ -121,9 +155,11 @@ def connect(
|
||||
storage_options=storage_options,
|
||||
**kwargs,
|
||||
)
|
||||
_check_s3_bucket_with_dots(str(uri), storage_options)
|
||||
|
||||
if kwargs:
|
||||
raise ValueError(f"Unknown keyword arguments: {kwargs}")
|
||||
|
||||
return LanceDBConnection(
|
||||
uri,
|
||||
read_consistency_interval=read_consistency_interval,
|
||||
@@ -211,6 +247,8 @@ async def connect_async(
|
||||
if isinstance(client_config, dict):
|
||||
client_config = ClientConfig(**client_config)
|
||||
|
||||
_check_s3_bucket_with_dots(str(uri), storage_options)
|
||||
|
||||
return AsyncConnection(
|
||||
await lancedb_connect(
|
||||
sanitize_uri(uri),
|
||||
|
||||
68
python/python/tests/test_s3_bucket_dots.py
Normal file
68
python/python/tests/test_s3_bucket_dots.py
Normal file
@@ -0,0 +1,68 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
"""
|
||||
Tests for S3 bucket names containing dots.
|
||||
|
||||
Related issue: https://github.com/lancedb/lancedb/issues/1898
|
||||
|
||||
These tests validate the early error checking for S3 bucket names with dots.
|
||||
No actual S3 connection is made - validation happens before connection.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import lancedb
|
||||
|
||||
# Test URIs
|
||||
BUCKET_WITH_DOTS = "s3://my.bucket.name/path"
|
||||
BUCKET_WITH_DOTS_AND_REGION = ("s3://my.bucket.name", {"region": "us-east-1"})
|
||||
BUCKET_WITH_DOTS_AND_AWS_REGION = ("s3://my.bucket.name", {"aws_region": "us-east-1"})
|
||||
BUCKET_WITHOUT_DOTS = "s3://my-bucket/path"
|
||||
|
||||
|
||||
class TestS3BucketWithDotsSync:
|
||||
"""Tests for connect()."""
|
||||
|
||||
def test_bucket_with_dots_requires_region(self):
|
||||
with pytest.raises(ValueError, match="contains dots"):
|
||||
lancedb.connect(BUCKET_WITH_DOTS)
|
||||
|
||||
def test_bucket_with_dots_and_region_passes(self):
|
||||
uri, opts = BUCKET_WITH_DOTS_AND_REGION
|
||||
db = lancedb.connect(uri, storage_options=opts)
|
||||
assert db is not None
|
||||
|
||||
def test_bucket_with_dots_and_aws_region_passes(self):
|
||||
uri, opts = BUCKET_WITH_DOTS_AND_AWS_REGION
|
||||
db = lancedb.connect(uri, storage_options=opts)
|
||||
assert db is not None
|
||||
|
||||
def test_bucket_without_dots_passes(self):
|
||||
db = lancedb.connect(BUCKET_WITHOUT_DOTS)
|
||||
assert db is not None
|
||||
|
||||
|
||||
class TestS3BucketWithDotsAsync:
|
||||
"""Tests for connect_async()."""
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_bucket_with_dots_requires_region(self):
|
||||
with pytest.raises(ValueError, match="contains dots"):
|
||||
await lancedb.connect_async(BUCKET_WITH_DOTS)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_bucket_with_dots_and_region_passes(self):
|
||||
uri, opts = BUCKET_WITH_DOTS_AND_REGION
|
||||
db = await lancedb.connect_async(uri, storage_options=opts)
|
||||
assert db is not None
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_bucket_with_dots_and_aws_region_passes(self):
|
||||
uri, opts = BUCKET_WITH_DOTS_AND_AWS_REGION
|
||||
db = await lancedb.connect_async(uri, storage_options=opts)
|
||||
assert db is not None
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_bucket_without_dots_passes(self):
|
||||
db = await lancedb.connect_async(BUCKET_WITHOUT_DOTS)
|
||||
assert db is not None
|
||||
Reference in New Issue
Block a user