From cd5f91bb7df032389b055960cb32471218b62c72 Mon Sep 17 00:00:00 2001 From: Ryan Green Date: Tue, 20 Jan 2026 19:56:46 -0330 Subject: [PATCH] feat: expose table uri (#2922) * Expose `table.uri` property for all tables, including remote tables * Fix bug in path calculation on windows file systems --- python/python/lancedb/_lancedb.pyi | 1 + python/python/lancedb/remote/table.py | 8 +++ python/python/lancedb/table.py | 18 ++++++ python/python/tests/test_table.py | 6 ++ python/src/table.rs | 5 ++ rust/lancedb/src/database/listing.rs | 31 ++++++++- rust/lancedb/src/remote/table.rs | 93 ++++++++++++++++++++++++++- rust/lancedb/src/table.rs | 17 ++--- 8 files changed, 166 insertions(+), 13 deletions(-) diff --git a/python/python/lancedb/_lancedb.pyi b/python/python/lancedb/_lancedb.pyi index 2a3260417..f85b6673a 100644 --- a/python/python/lancedb/_lancedb.pyi +++ b/python/python/lancedb/_lancedb.pyi @@ -179,6 +179,7 @@ class Table: cleanup_since_ms: Optional[int] = None, delete_unverified: Optional[bool] = None, ) -> OptimizeStats: ... + async def uri(self) -> str: ... @property def tags(self) -> Tags: ... def query(self) -> Query: ... diff --git a/python/python/lancedb/remote/table.py b/python/python/lancedb/remote/table.py index 7961995c6..e95d3d951 100644 --- a/python/python/lancedb/remote/table.py +++ b/python/python/lancedb/remote/table.py @@ -655,6 +655,14 @@ class RemoteTable(Table): def stats(self): return LOOP.run(self._table.stats()) + @property + def uri(self) -> str: + """The table URI (storage location). + + For remote tables, this fetches the location from the server via describe. + """ + return LOOP.run(self._table.uri()) + def take_offsets(self, offsets: list[int]) -> LanceTakeQueryBuilder: return LanceTakeQueryBuilder(self._table.take_offsets(offsets)) diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py index 72451d2be..b18d6e032 100644 --- a/python/python/lancedb/table.py +++ b/python/python/lancedb/table.py @@ -2218,6 +2218,10 @@ class LanceTable(Table): def stats(self) -> TableStatistics: return LOOP.run(self._table.stats()) + @property + def uri(self) -> str: + return LOOP.run(self._table.uri()) + def create_scalar_index( self, column: str, @@ -3606,6 +3610,20 @@ class AsyncTable: """ return await self._inner.stats() + async def uri(self) -> str: + """ + Get the table URI (storage location). + + For remote tables, this fetches the location from the server via describe. + For local tables, this returns the dataset URI. + + Returns + ------- + str + The full storage location of the table (e.g., S3/GCS path). + """ + return await self._inner.uri() + async def add( self, data: DATA, diff --git a/python/python/tests/test_table.py b/python/python/tests/test_table.py index f145d3c42..e7e0e5695 100644 --- a/python/python/tests/test_table.py +++ b/python/python/tests/test_table.py @@ -1967,3 +1967,9 @@ def test_add_table_with_empty_embeddings(tmp_path): on_bad_vectors="drop", ) assert table.count_rows() == 1 + + +def test_table_uri(tmp_path): + db = lancedb.connect(tmp_path) + table = db.create_table("my_table", data=[{"x": 0}]) + assert table.uri == str(tmp_path / "my_table.lance") diff --git a/python/src/table.rs b/python/src/table.rs index 6d75eae71..088712c4b 100644 --- a/python/src/table.rs +++ b/python/src/table.rs @@ -504,6 +504,11 @@ impl Table { }) } + pub fn uri(self_: PyRef<'_, Self>) -> PyResult> { + let inner = self_.inner_ref()?.clone(); + future_into_py(self_.py(), async move { inner.uri().await.infer_error() }) + } + pub fn __repr__(&self) -> String { match &self.inner { None => format!("ClosedTable({})", self.name), diff --git a/rust/lancedb/src/database/listing.rs b/rust/lancedb/src/database/listing.rs index 626baa46f..8edb505ef 100644 --- a/rust/lancedb/src/database/listing.rs +++ b/rust/lancedb/src/database/listing.rs @@ -463,9 +463,20 @@ impl ListingDatabase { validate_table_name(name)?; let mut uri = self.uri.clone(); - // If the URI does not end with a slash, add one - if !uri.ends_with('/') { - uri.push('/'); + // If the URI does not end with a path separator, add one + // Use forward slash for URIs (http://, s3://, gs://, file://, etc.) + // Use platform-specific separator for local paths without scheme + let has_scheme = uri.contains("://"); + let ends_with_separator = uri.ends_with('/') || uri.ends_with('\\'); + + if !ends_with_separator { + if has_scheme { + // URIs always use forward slash + uri.push('/'); + } else { + // Local path without scheme - use platform separator + uri.push(std::path::MAIN_SEPARATOR); + } } // Append the table name with the lance file extension uri.push_str(&format!("{}.{}", name, LANCE_FILE_EXTENSION)); @@ -1071,6 +1082,7 @@ mod tests { use crate::table::{Table, TableDefinition}; use arrow_array::{Int32Array, RecordBatch, StringArray}; use arrow_schema::{DataType, Field, Schema}; + use std::path::PathBuf; use tempfile::tempdir; async fn setup_database() -> (tempfile::TempDir, ListingDatabase) { @@ -2046,6 +2058,19 @@ mod tests { assert_eq!(db_options.new_table_config.enable_stable_row_ids, None); } + #[tokio::test] + async fn test_table_uri() { + let (_tempdir, db) = setup_database().await; + + let mut pb = PathBuf::new(); + pb.push(db.uri.clone()); + pb.push("test.lance"); + + let expected = pb.to_str().unwrap(); + let uri = db.table_uri("test").ok().unwrap(); + assert_eq!(uri, expected); + } + #[tokio::test] async fn test_namespace_client() { let (_tempdir, db) = setup_database().await; diff --git a/rust/lancedb/src/remote/table.rs b/rust/lancedb/src/remote/table.rs index 2a941144e..c3339d3c5 100644 --- a/rust/lancedb/src/remote/table.rs +++ b/rust/lancedb/src/remote/table.rs @@ -204,6 +204,7 @@ pub struct RemoteTable { server_version: ServerVersion, version: RwLock>, + location: RwLock>, } impl RemoteTable { @@ -221,6 +222,7 @@ impl RemoteTable { identifier, server_version, version: RwLock::new(None), + location: RwLock::new(None), } } @@ -639,6 +641,7 @@ impl RemoteTable { struct TableDescription { version: u64, schema: JsonSchema, + location: Option, } impl std::fmt::Display for RemoteTable { @@ -667,6 +670,7 @@ mod test_utils { identifier: name, server_version: version.map(ServerVersion).unwrap_or_default(), version: RwLock::new(None), + location: RwLock::new(None), } } } @@ -1461,8 +1465,28 @@ impl BaseTable for RemoteTable { message: "table_definition is not supported on LanceDB cloud.".into(), }) } - fn dataset_uri(&self) -> &str { - "NOT_SUPPORTED" + async fn uri(&self) -> Result { + // Check if we already have the location cached + { + let location = self.location.read().await; + if let Some(ref loc) = *location { + return Ok(loc.clone()); + } + } + + // Fetch from server via describe + let description = self.describe().await?; + let location = description.location.ok_or_else(|| Error::NotSupported { + message: "Table URI not supported by the server".into(), + })?; + + // Cache the location for future use + { + let mut cached_location = self.location.write().await; + *cached_location = Some(location.clone()); + } + + Ok(location) } async fn storage_options(&self) -> Option> { @@ -3332,4 +3356,69 @@ mod tests { let result = table.drop_columns(&["old_col1", "old_col2"]).await.unwrap(); assert_eq!(result.version, 5); } + + #[tokio::test] + async fn test_uri() { + let table = Table::new_with_handler("my_table", |request| { + assert_eq!(request.method(), "POST"); + assert_eq!(request.url().path(), "/v1/table/my_table/describe/"); + + http::Response::builder() + .status(200) + .body(r#"{"version": 1, "schema": {"fields": []}, "location": "s3://bucket/path/to/table"}"#) + .unwrap() + }); + + let uri = table.uri().await.unwrap(); + assert_eq!(uri, "s3://bucket/path/to/table"); + } + + #[tokio::test] + async fn test_uri_missing_location() { + let table = Table::new_with_handler("my_table", |request| { + assert_eq!(request.method(), "POST"); + assert_eq!(request.url().path(), "/v1/table/my_table/describe/"); + + // Server returns response without location field + http::Response::builder() + .status(200) + .body(r#"{"version": 1, "schema": {"fields": []}}"#) + .unwrap() + }); + + let result = table.uri().await; + assert!(result.is_err()); + assert!(matches!(&result, Err(Error::NotSupported { .. }))); + } + + #[tokio::test] + async fn test_uri_caching() { + use std::sync::atomic::{AtomicUsize, Ordering}; + use std::sync::Arc; + + let call_count = Arc::new(AtomicUsize::new(0)); + let call_count_clone = call_count.clone(); + + let table = Table::new_with_handler("my_table", move |request| { + assert_eq!(request.url().path(), "/v1/table/my_table/describe/"); + call_count_clone.fetch_add(1, Ordering::SeqCst); + + http::Response::builder() + .status(200) + .body( + r#"{"version": 1, "schema": {"fields": []}, "location": "gs://bucket/table"}"#, + ) + .unwrap() + }); + + // First call should fetch from server + let uri1 = table.uri().await.unwrap(); + assert_eq!(uri1, "gs://bucket/table"); + assert_eq!(call_count.load(Ordering::SeqCst), 1); + + // Second call should use cached value + let uri2 = table.uri().await.unwrap(); + assert_eq!(uri2, "gs://bucket/table"); + assert_eq!(call_count.load(Ordering::SeqCst), 1); // Still 1, no new call + } } diff --git a/rust/lancedb/src/table.rs b/rust/lancedb/src/table.rs index 60421905d..9d7148b14 100644 --- a/rust/lancedb/src/table.rs +++ b/rust/lancedb/src/table.rs @@ -608,8 +608,8 @@ pub trait BaseTable: std::fmt::Display + std::fmt::Debug + Send + Sync { async fn list_versions(&self) -> Result>; /// Get the table definition. async fn table_definition(&self) -> Result; - /// Get the table URI - fn dataset_uri(&self) -> &str; + /// Get the table URI (storage location) + async fn uri(&self) -> Result; /// Get the storage options used when opening this table, if any. async fn storage_options(&self) -> Option>; /// Poll until the columns are fully indexed. Will return Error::Timeout if the columns @@ -1317,11 +1317,12 @@ impl Table { self.inner.list_indices().await } - /// Get the underlying dataset URI + /// Get the table URI (storage location) /// - /// Warning: This is an internal API and the return value is subject to change. - pub fn dataset_uri(&self) -> &str { - self.inner.dataset_uri() + /// Returns the full storage location of the table (e.g., S3/GCS path). + /// For remote tables, this fetches the location from the server via describe. + pub async fn uri(&self) -> Result { + self.inner.uri().await } /// Get the storage options used when opening this table, if any. @@ -3234,8 +3235,8 @@ impl BaseTable for NativeTable { Ok(results.into_iter().flatten().collect()) } - fn dataset_uri(&self) -> &str { - self.uri.as_str() + async fn uri(&self) -> Result { + Ok(self.uri.clone()) } async fn storage_options(&self) -> Option> {