def _check_s3_bucket_with_dots(
    uri: str, storage_options: Optional[Dict[str, str]]
) -> None:
    """
    Fail fast when an S3 bucket name contains dots and no region is configured.

    S3 buckets with dots cannot use virtual-hosted-style URLs, which breaks
    automatic region detection, so the region has to be supplied explicitly.
    The check is deliberately conservative: it only raises when neither a
    region nor a custom endpoint can be found in ``storage_options`` or in the
    ``AWS_*`` environment variables.

    ``connect()`` and ``connect_async()`` call this with ``str(uri)`` and the
    caller's ``storage_options`` before building the connection.

    See: https://github.com/lancedb/lancedb/issues/1898

    Parameters
    ----------
    uri: str
        Database URI. Anything that is not a string starting with ``s3://``
        is ignored.
    storage_options: dict, optional
        Object-store options passed by the caller. Keys are compared
        case-insensitively.

    Raises
    ------
    ValueError
        If the bucket name contains a dot and no region/endpoint setting is
        found.
    """
    # Function-scope imports keep the helper self-contained regardless of what
    # the surrounding module happens to import.
    import os
    from urllib.parse import urlparse

    if not isinstance(uri, str) or not uri.startswith("s3://"):
        return

    bucket = urlparse(uri).netloc
    if "." not in bucket:
        return

    region_keys = {"region", "aws_region", "default_region", "aws_default_region"}
    # NOTE(review): assumes the storage backend skips bucket-region
    # auto-detection when a custom endpoint (e.g. an S3-compatible store) is
    # configured -- confirm against the object-store layer.
    endpoint_keys = {"endpoint", "aws_endpoint", "endpoint_url", "aws_endpoint_url"}
    accepted_keys = region_keys | endpoint_keys

    # Option keys are normalised to lower case so "AWS_REGION" and "region"
    # are treated alike.
    provided_keys = {str(key).lower() for key in (storage_options or {})}
    if provided_keys & accepted_keys:
        return

    # NOTE(review): assumes the backend also honours these environment
    # variables; raising here would otherwise reject a working setup.
    env_names = (
        "AWS_REGION",
        "AWS_DEFAULT_REGION",
        "AWS_ENDPOINT",
        "AWS_ENDPOINT_URL",
    )
    if any(os.environ.get(name) for name in env_names):
        return

    raise ValueError(
        f"S3 bucket name '{bucket}' contains dots, which prevents automatic "
        "region detection. Please specify the region explicitly via "
        "storage_options={'region': '<your-region>'} or "
        "storage_options={'aws_region': '<your-region>'}. "
        "See https://github.com/lancedb/lancedb/issues/1898 for details."
    )
# Tests for the early validation of S3 bucket names containing dots.
# Related issue: https://github.com/lancedb/lancedb/issues/1898

import pytest
import lancedb

# Test URIs
BUCKET_WITH_DOTS = "s3://my.bucket.name/path"
BUCKET_WITH_DOTS_AND_REGION = ("s3://my.bucket.name", {"region": "us-east-1"})
BUCKET_WITH_DOTS_AND_AWS_REGION = ("s3://my.bucket.name", {"aws_region": "us-east-1"})
BUCKET_WITHOUT_DOTS = "s3://my-bucket/path"


class TestS3BucketWithDotsSync:
    """Exercise the dotted-bucket check through the synchronous connect()."""

    def test_bucket_with_dots_requires_region(self):
        """A dotted bucket with no storage options is rejected."""
        with pytest.raises(ValueError, match="contains dots"):
            lancedb.connect(BUCKET_WITH_DOTS)

    def test_bucket_with_dots_and_region_passes(self):
        """Supplying ``region`` lets a dotted bucket through."""
        conn = lancedb.connect(
            BUCKET_WITH_DOTS_AND_REGION[0],
            storage_options=BUCKET_WITH_DOTS_AND_REGION[1],
        )
        assert conn is not None

    def test_bucket_with_dots_and_aws_region_passes(self):
        """Supplying ``aws_region`` lets a dotted bucket through."""
        conn = lancedb.connect(
            BUCKET_WITH_DOTS_AND_AWS_REGION[0],
            storage_options=BUCKET_WITH_DOTS_AND_AWS_REGION[1],
        )
        assert conn is not None

    def test_bucket_without_dots_passes(self):
        """A bucket name without dots needs no region."""
        conn = lancedb.connect(BUCKET_WITHOUT_DOTS)
        assert conn is not None


class TestS3BucketWithDotsAsync:
    """Exercise the dotted-bucket check through connect_async()."""

    @pytest.mark.asyncio
    async def test_bucket_with_dots_requires_region(self):
        """A dotted bucket with no storage options is rejected."""
        with pytest.raises(ValueError, match="contains dots"):
            await lancedb.connect_async(BUCKET_WITH_DOTS)

    @pytest.mark.asyncio
    async def test_bucket_with_dots_and_region_passes(self):
        """Supplying ``region`` lets a dotted bucket through."""
        conn = await lancedb.connect_async(
            BUCKET_WITH_DOTS_AND_REGION[0],
            storage_options=BUCKET_WITH_DOTS_AND_REGION[1],
        )
        assert conn is not None

    @pytest.mark.asyncio
    async def test_bucket_with_dots_and_aws_region_passes(self):
        """Supplying ``aws_region`` lets a dotted bucket through."""
        conn = await lancedb.connect_async(
            BUCKET_WITH_DOTS_AND_AWS_REGION[0],
            storage_options=BUCKET_WITH_DOTS_AND_AWS_REGION[1],
        )
        assert conn is not None

    @pytest.mark.asyncio
    async def test_bucket_without_dots_passes(self):
        """A bucket name without dots needs no region."""
        conn = await lancedb.connect_async(BUCKET_WITHOUT_DOTS)
        assert conn is not None