mirror of
https://github.com/lancedb/lancedb.git
synced 2026-05-15 11:00:41 +00:00
fix(database): drop spurious trailing ? from listing-database URIs (#3357)
## Summary
`url::Url::query_pairs_mut()` leaves the URL with `query=Some("")` after
`.clear()` even when the input had no query string. The listing-database
connect path then captured that empty query into
`ListingDatabase::query_string`, and `table_uri()` blindly appended
`?<query>` to every per-table URI — producing URIs like
`s3://bucket/prefix/foo.lance?`.
The trailing `?` is benign for normal table operations, but it breaks
any caller that constructs a sub-path from the table URI. In particular,
MemWAL flushes write to `<table_uri>/_mem_wal/<shard>/<rand>_gen_<n>`,
which `url::Url::parse` then re-parses as `path=<base table>` +
`query=/_mem_wal/...`. `Dataset::write` resolves the base table dataset,
finds it already exists, and fails with `Dataset already exists:
…_gen_1` on the very first MemTable flush (observed deterministically
against S3 across all merge_insert LSM modes; tracked in
[lance-format/lance#6713](https://github.com/lance-format/lance/pull/6715)).
## Fix
Treat `Some("")` query the same as no query when capturing
`query_string`. A real `?foo=bar` query is still propagated unchanged.
Adds a regression test covering both the empty-query and non-empty-query
paths.
## Verification
- `url::Url::parse("s3://bucket/prefix/").query()` → `None`, but after
`query_pairs_mut().clear()` → `Some("")`. Confirmed in a standalone
repro.
- Without this fix, every `table_uri()` for an `s3://`-style connection
ends with `?`, breaking MemWAL and any future sub-path consumer in the
same way.
- New unit test `test_table_uri_url_path_has_no_trailing_question_mark`
exercises both code paths.
This commit is contained in:
@@ -505,8 +505,15 @@ impl ListingDatabase {
|
||||
// Filter out the commit store query param -- it's a lancedb param
|
||||
url.query_pairs_mut().clear();
|
||||
url.query_pairs_mut().extend_pairs(filtered_querys);
|
||||
// Take a copy of the query string so we can propagate it to lance
|
||||
let query_string = url.query().map(|s| s.to_string());
|
||||
// Take a copy of the query string so we can propagate it to lance.
|
||||
// `query_pairs_mut()` leaves the URL with `Some("")` even when no
|
||||
// pairs survive (or none existed in the first place), so an empty
|
||||
// string here must be treated the same as "no query" — otherwise
|
||||
// every table URI ends up with a trailing `?`, which makes downstream
|
||||
// sub-paths (e.g. MemWAL gen paths) re-parse as path=<base table> +
|
||||
// query=<sub-path>, causing Lance to find the base table dataset
|
||||
// when looking up the sub-path.
|
||||
let query_string = url.query().filter(|q| !q.is_empty()).map(|s| s.to_string());
|
||||
// clear the query string so we can use the url as the base uri
|
||||
// use .set_query(None) instead of .set_query("") because the latter
|
||||
// will add a trailing '?' to the url
|
||||
@@ -2213,6 +2220,133 @@ mod tests {
|
||||
assert_eq!(uri, expected);
|
||||
}
|
||||
|
||||
/// Regression: connecting via a URL-style URI (which goes through
|
||||
/// `url::Url::parse` and the `query_pairs_mut()` path) must not
|
||||
/// append a trailing `?` to per-table URIs when the input URI has
|
||||
/// no query string.
|
||||
///
|
||||
/// Earlier, `query_pairs_mut().clear()` left the URL with
|
||||
/// `query=Some("")`, which then propagated as a trailing `?` on
|
||||
/// every table URI. Sub-path lookups against that URI (e.g. MemWAL
|
||||
/// `<table_uri>/_mem_wal/<shard>/<rand>_gen_<n>`) re-parsed as
|
||||
/// `path=<base table>` + `query=/_mem_wal/...`, causing
|
||||
/// `Dataset::write` to find the base table dataset and falsely
|
||||
/// report `Dataset already exists`.
|
||||
/// Mirrors the URL-mutation step from
|
||||
/// [`ListingDatabase::connect_with_options`] so we can assert the
|
||||
/// fix without going through filesystem setup (which is awkward
|
||||
/// across platforms — see the `file://` test below).
|
||||
fn capture_query_like_connect(input_uri: &str) -> Option<String> {
|
||||
let mut url = url::Url::parse(input_uri).unwrap();
|
||||
let mut filtered_querys = Vec::new();
|
||||
for (key, value) in url.query_pairs() {
|
||||
if key == ENGINE || key == MIRRORED_STORE {
|
||||
continue;
|
||||
}
|
||||
filtered_querys.push((key.to_string(), value.to_string()));
|
||||
}
|
||||
url.query_pairs_mut().clear();
|
||||
url.query_pairs_mut().extend_pairs(filtered_querys);
|
||||
url.query().filter(|q| !q.is_empty()).map(|s| s.to_string())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_capture_query_treats_empty_as_none() {
|
||||
// No query at all. With the bug, `query_pairs_mut()` left the
|
||||
// URL with `query=Some("")` and we used to propagate that.
|
||||
assert_eq!(
|
||||
capture_query_like_connect("s3://bucket/prefix/"),
|
||||
None,
|
||||
"empty query after mutation must be treated as no query"
|
||||
);
|
||||
|
||||
// Real query is propagated.
|
||||
assert_eq!(
|
||||
capture_query_like_connect("s3://bucket/prefix/?foo=bar"),
|
||||
Some("foo=bar".to_string())
|
||||
);
|
||||
|
||||
// lancedb-internal `engine=` is stripped; nothing remains, so
|
||||
// query_string is None — not Some("").
|
||||
assert_eq!(
|
||||
capture_query_like_connect(&format!("s3://bucket/prefix/?{}=mem", ENGINE)),
|
||||
None
|
||||
);
|
||||
|
||||
// Mixed: drop `engine=`, keep the rest.
|
||||
let captured =
|
||||
capture_query_like_connect(&format!("s3://bucket/prefix/?{}=mem&foo=bar", ENGINE));
|
||||
assert_eq!(captured.as_deref(), Some("foo=bar"));
|
||||
}
|
||||
|
||||
/// Regression: connecting via a URL-style URI (which goes through
|
||||
/// `url::Url::parse` and the `query_pairs_mut()` path) must not
|
||||
/// append a trailing `?` to per-table URIs when the input URI has
|
||||
/// no query string. Sub-path lookups against such a URI (e.g.
|
||||
/// MemWAL `<table_uri>/_mem_wal/<shard>/<rand>_gen_<n>`) re-parse
|
||||
/// as `path=<base table>` + `query=/_mem_wal/...`, causing
|
||||
/// `Dataset::write` to find the base table dataset and falsely
|
||||
/// report `Dataset already exists`.
|
||||
///
|
||||
/// Skipped on Windows: `try_create_dir` does not understand
|
||||
/// `file:///C:/…` paths so `connect_with_options` fails before
|
||||
/// even reaching the URL-mutation logic. The pure URL-mutation
|
||||
/// invariant is covered by
|
||||
/// `test_capture_query_treats_empty_as_none` above, which runs
|
||||
/// on all platforms.
|
||||
#[cfg(not(windows))]
|
||||
#[tokio::test]
|
||||
async fn test_table_uri_url_path_has_no_trailing_question_mark() {
|
||||
let tempdir = tempdir().unwrap();
|
||||
let uri = format!("file://{}", tempdir.path().to_str().unwrap());
|
||||
|
||||
let request = ConnectRequest {
|
||||
uri: uri.clone(),
|
||||
#[cfg(feature = "remote")]
|
||||
client_config: Default::default(),
|
||||
options: Default::default(),
|
||||
namespace_client_properties: Default::default(),
|
||||
manifest_enabled: false,
|
||||
read_consistency_interval: None,
|
||||
session: None,
|
||||
};
|
||||
let db = ListingDatabase::connect_with_options(&request)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(
|
||||
db.query_string, None,
|
||||
"no input query → no captured query_string"
|
||||
);
|
||||
|
||||
let table_uri = db.table_uri("test").unwrap();
|
||||
assert!(
|
||||
!table_uri.ends_with('?'),
|
||||
"table_uri must not have a trailing `?`: {}",
|
||||
table_uri
|
||||
);
|
||||
assert_eq!(table_uri, format!("{}/test.lance", uri));
|
||||
|
||||
// A real query string should still be propagated.
|
||||
let with_query = format!("{}?foo=bar", uri);
|
||||
let request_with_query = ConnectRequest {
|
||||
uri: with_query,
|
||||
#[cfg(feature = "remote")]
|
||||
client_config: Default::default(),
|
||||
options: Default::default(),
|
||||
namespace_client_properties: Default::default(),
|
||||
manifest_enabled: false,
|
||||
read_consistency_interval: None,
|
||||
session: None,
|
||||
};
|
||||
let db_with_query = ListingDatabase::connect_with_options(&request_with_query)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(db_with_query.query_string.as_deref(), Some("foo=bar"));
|
||||
let table_uri = db_with_query.table_uri("test").unwrap();
|
||||
assert_eq!(table_uri, format!("{}/test.lance?foo=bar", uri));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_namespace_client() {
|
||||
let (_tempdir, db) = setup_database().await;
|
||||
|
||||
Reference in New Issue
Block a user