diff --git a/rust/lancedb/src/database/listing.rs b/rust/lancedb/src/database/listing.rs index d1831f52d..7b7657bf3 100644 --- a/rust/lancedb/src/database/listing.rs +++ b/rust/lancedb/src/database/listing.rs @@ -505,8 +505,15 @@ impl ListingDatabase { // Filter out the commit store query param -- it's a lancedb param url.query_pairs_mut().clear(); url.query_pairs_mut().extend_pairs(filtered_querys); - // Take a copy of the query string so we can propagate it to lance - let query_string = url.query().map(|s| s.to_string()); + // Take a copy of the query string so we can propagate it to lance. + // `query_pairs_mut()` leaves the URL with `Some("")` even when no + // pairs survive (or none existed in the first place), so an empty + // string here must be treated the same as "no query" — otherwise + // every table URI ends up with a trailing `?`, which makes downstream + // sub-paths (e.g. MemWAL gen paths) re-parse as path= + + // query=, causing Lance to find the base table dataset + // when looking up the sub-path. + let query_string = url.query().filter(|q| !q.is_empty()).map(|s| s.to_string()); // clear the query string so we can use the url as the base uri // use .set_query(None) instead of .set_query("") because the latter // will add a trailing '?' to the url @@ -2213,6 +2220,133 @@ mod tests { assert_eq!(uri, expected); } + /// Regression: connecting via a URL-style URI (which goes through + /// `url::Url::parse` and the `query_pairs_mut()` path) must not + /// append a trailing `?` to per-table URIs when the input URI has + /// no query string. + /// + /// Earlier, `query_pairs_mut().clear()` left the URL with + /// `query=Some("")`, which then propagated as a trailing `?` on + /// every table URI. Sub-path lookups against that URI (e.g. MemWAL + /// `/_mem_wal//_gen_`) re-parsed as + /// `path=` + `query=/_mem_wal/...`, causing + /// `Dataset::write` to find the base table dataset and falsely + /// report `Dataset already exists`. + /// Mirrors the URL-mutation step from + /// [`ListingDatabase::connect_with_options`] so we can assert the + /// fix without going through filesystem setup (which is awkward + /// across platforms — see the `file://` test below). + fn capture_query_like_connect(input_uri: &str) -> Option { + let mut url = url::Url::parse(input_uri).unwrap(); + let mut filtered_querys = Vec::new(); + for (key, value) in url.query_pairs() { + if key == ENGINE || key == MIRRORED_STORE { + continue; + } + filtered_querys.push((key.to_string(), value.to_string())); + } + url.query_pairs_mut().clear(); + url.query_pairs_mut().extend_pairs(filtered_querys); + url.query().filter(|q| !q.is_empty()).map(|s| s.to_string()) + } + + #[test] + fn test_capture_query_treats_empty_as_none() { + // No query at all. With the bug, `query_pairs_mut()` left the + // URL with `query=Some("")` and we used to propagate that. + assert_eq!( + capture_query_like_connect("s3://bucket/prefix/"), + None, + "empty query after mutation must be treated as no query" + ); + + // Real query is propagated. + assert_eq!( + capture_query_like_connect("s3://bucket/prefix/?foo=bar"), + Some("foo=bar".to_string()) + ); + + // lancedb-internal `engine=` is stripped; nothing remains, so + // query_string is None — not Some(""). + assert_eq!( + capture_query_like_connect(&format!("s3://bucket/prefix/?{}=mem", ENGINE)), + None + ); + + // Mixed: drop `engine=`, keep the rest. + let captured = + capture_query_like_connect(&format!("s3://bucket/prefix/?{}=mem&foo=bar", ENGINE)); + assert_eq!(captured.as_deref(), Some("foo=bar")); + } + + /// Regression: connecting via a URL-style URI (which goes through + /// `url::Url::parse` and the `query_pairs_mut()` path) must not + /// append a trailing `?` to per-table URIs when the input URI has + /// no query string. Sub-path lookups against such a URI (e.g. + /// MemWAL `/_mem_wal//_gen_`) re-parse + /// as `path=` + `query=/_mem_wal/...`, causing + /// `Dataset::write` to find the base table dataset and falsely + /// report `Dataset already exists`. + /// + /// Skipped on Windows: `try_create_dir` does not understand + /// `file:///C:/…` paths so `connect_with_options` fails before + /// even reaching the URL-mutation logic. The pure URL-mutation + /// invariant is covered by + /// `test_capture_query_treats_empty_as_none` above, which runs + /// on all platforms. + #[cfg(not(windows))] + #[tokio::test] + async fn test_table_uri_url_path_has_no_trailing_question_mark() { + let tempdir = tempdir().unwrap(); + let uri = format!("file://{}", tempdir.path().to_str().unwrap()); + + let request = ConnectRequest { + uri: uri.clone(), + #[cfg(feature = "remote")] + client_config: Default::default(), + options: Default::default(), + namespace_client_properties: Default::default(), + manifest_enabled: false, + read_consistency_interval: None, + session: None, + }; + let db = ListingDatabase::connect_with_options(&request) + .await + .unwrap(); + + assert_eq!( + db.query_string, None, + "no input query → no captured query_string" + ); + + let table_uri = db.table_uri("test").unwrap(); + assert!( + !table_uri.ends_with('?'), + "table_uri must not have a trailing `?`: {}", + table_uri + ); + assert_eq!(table_uri, format!("{}/test.lance", uri)); + + // A real query string should still be propagated. + let with_query = format!("{}?foo=bar", uri); + let request_with_query = ConnectRequest { + uri: with_query, + #[cfg(feature = "remote")] + client_config: Default::default(), + options: Default::default(), + namespace_client_properties: Default::default(), + manifest_enabled: false, + read_consistency_interval: None, + session: None, + }; + let db_with_query = ListingDatabase::connect_with_options(&request_with_query) + .await + .unwrap(); + assert_eq!(db_with_query.query_string.as_deref(), Some("foo=bar")); + let table_uri = db_with_query.table_uri("test").unwrap(); + assert_eq!(table_uri, format!("{}/test.lance?foo=bar", uri)); + } + #[tokio::test] async fn test_namespace_client() { let (_tempdir, db) = setup_database().await;