diff --git a/Cargo.lock b/Cargo.lock index fec2d0c5a..1db3d8485 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3296,8 +3296,8 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "7.0.0-beta.13" -source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1" +version = "7.0.0-rc.1" +source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af" dependencies = [ "arrow-array", "rand 0.9.4", @@ -4518,8 +4518,8 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "7.0.0-beta.13" -source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1" +version = "7.0.0-rc.1" +source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af" dependencies = [ "arc-swap", "arrow", @@ -4537,6 +4537,7 @@ dependencies = [ "async_cell", "aws-credential-types", "aws-sdk-dynamodb", + "bitpacking", "byteorder", "bytes", "chrono", @@ -4566,6 +4567,7 @@ dependencies = [ "lance-table", "lance-tokenizer", "log", + "moka", "object_store", "permutation", "pin-project", @@ -4589,8 +4591,8 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "7.0.0-beta.13" -source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1" +version = "7.0.0-rc.1" +source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af" dependencies = [ "arrow-array", "arrow-buffer", @@ -4610,8 +4612,8 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "7.0.0-beta.13" -source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1" +version = "7.0.0-rc.1" +source = 
"git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af" dependencies = [ "arrayref", "paste", @@ -4620,8 +4622,8 @@ dependencies = [ [[package]] name = "lance-core" -version = "7.0.0-beta.13" -source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1" +version = "7.0.0-rc.1" +source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af" dependencies = [ "arrow-array", "arrow-buffer", @@ -4656,8 +4658,8 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "7.0.0-beta.13" -source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1" +version = "7.0.0-rc.1" +source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af" dependencies = [ "arrow", "arrow-array", @@ -4687,8 +4689,8 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "7.0.0-beta.13" -source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1" +version = "7.0.0-rc.1" +source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af" dependencies = [ "arrow", "arrow-array", @@ -4706,8 +4708,8 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "7.0.0-beta.13" -source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1" +version = "7.0.0-rc.1" +source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af" dependencies = [ "arrow-arith", "arrow-array", @@ -4742,8 +4744,8 @@ dependencies = [ [[package]] name = "lance-file" -version = "7.0.0-beta.13" -source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1" +version = "7.0.0-rc.1" 
+source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af" dependencies = [ "arrow-arith", "arrow-array", @@ -4774,8 +4776,8 @@ dependencies = [ [[package]] name = "lance-index" -version = "7.0.0-beta.13" -source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1" +version = "7.0.0-rc.1" +source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af" dependencies = [ "arc-swap", "arrow", @@ -4839,8 +4841,8 @@ dependencies = [ [[package]] name = "lance-io" -version = "7.0.0-beta.13" -source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1" +version = "7.0.0-rc.1" +source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af" dependencies = [ "arrow", "arrow-arith", @@ -4882,8 +4884,8 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "7.0.0-beta.13" -source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1" +version = "7.0.0-rc.1" +source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af" dependencies = [ "arrow-array", "arrow-buffer", @@ -4899,8 +4901,8 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "7.0.0-beta.13" -source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1" +version = "7.0.0-rc.1" +source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af" dependencies = [ "arrow", "async-trait", @@ -4912,8 +4914,8 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "7.0.0-beta.13" -source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1" +version = 
"7.0.0-rc.1" +source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af" dependencies = [ "arrow", "arrow-ipc", @@ -4948,9 +4950,9 @@ dependencies = [ [[package]] name = "lance-namespace-reqwest-client" -version = "0.7.6" +version = "0.7.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f65e31bdaa13e01dab6e7cf566da31df243c34a542f0d915d3601ec0e01e61d2" +checksum = "6369eee4682fb11edf538388b43c61ce288b8302fe89bb40944d7daa7faaae99" dependencies = [ "reqwest 0.12.28", "serde", @@ -4962,8 +4964,8 @@ dependencies = [ [[package]] name = "lance-table" -version = "7.0.0-beta.13" -source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1" +version = "7.0.0-rc.1" +source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af" dependencies = [ "arrow", "arrow-array", @@ -5002,8 +5004,8 @@ dependencies = [ [[package]] name = "lance-testing" -version = "7.0.0-beta.13" -source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1" +version = "7.0.0-rc.1" +source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af" dependencies = [ "arrow-array", "arrow-schema", @@ -5014,8 +5016,8 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "7.0.0-beta.13" -source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1" +version = "7.0.0-rc.1" +source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af" dependencies = [ "jieba-rs", "lindera", diff --git a/Cargo.toml b/Cargo.toml index 1c87a4f25..f829a878a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,20 +13,20 @@ categories = ["database-implementations"] rust-version = "1.91.0" [workspace.dependencies] -lance = { 
"version" = "=7.0.0-beta.13", default-features = false, "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" } -lance-core = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" } -lance-datagen = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" } -lance-file = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" } -lance-io = { "version" = "=7.0.0-beta.13", default-features = false, "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" } -lance-index = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" } -lance-linalg = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" } -lance-namespace = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" } -lance-namespace-impls = { "version" = "=7.0.0-beta.13", default-features = false, "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" } -lance-table = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" } -lance-testing = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" } -lance-datafusion = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" } -lance-encoding = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" } -lance-arrow = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" } +lance = { "version" = "=7.0.0-rc.1", default-features = false, "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" 
} +lance-core = { "version" = "=7.0.0-rc.1", "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" } +lance-datagen = { "version" = "=7.0.0-rc.1", "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" } +lance-file = { "version" = "=7.0.0-rc.1", "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" } +lance-io = { "version" = "=7.0.0-rc.1", default-features = false, "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" } +lance-index = { "version" = "=7.0.0-rc.1", "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" } +lance-linalg = { "version" = "=7.0.0-rc.1", "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" } +lance-namespace = { "version" = "=7.0.0-rc.1", "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" } +lance-namespace-impls = { "version" = "=7.0.0-rc.1", default-features = false, "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" } +lance-table = { "version" = "=7.0.0-rc.1", "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" } +lance-testing = { "version" = "=7.0.0-rc.1", "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" } +lance-datafusion = { "version" = "=7.0.0-rc.1", "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" } +lance-encoding = { "version" = "=7.0.0-rc.1", "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" } +lance-arrow = { "version" = "=7.0.0-rc.1", "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" } ahash = "0.8" # Note that this one does not include pyarrow arrow = { version = "58.0.0", optional = false } diff --git a/nodejs/__test__/connection.test.ts b/nodejs/__test__/connection.test.ts index d195a0ca4..68180471a 100644 --- a/nodejs/__test__/connection.test.ts +++ b/nodejs/__test__/connection.test.ts @@ -171,18 +171,22 @@ describe("given a 
connection", () => { let manifestDir = tmpDir.name + "/test_manifest_paths_v2_empty.lance/_versions"; - readdirSync(manifestDir).forEach((file) => { - expect(file).toMatch(/^\d{20}\.manifest$/); - }); + readdirSync(manifestDir) + .filter((f) => f.endsWith(".manifest")) + .forEach((file) => { + expect(file).toMatch(/^\d{20}\.manifest$/); + }); table = (await db.createTable("test_manifest_paths_v2", [{ id: 1 }], { enableV2ManifestPaths: true, })) as LocalTable; expect(await table.usesV2ManifestPaths()).toBe(true); manifestDir = tmpDir.name + "/test_manifest_paths_v2.lance/_versions"; - readdirSync(manifestDir).forEach((file) => { - expect(file).toMatch(/^\d{20}\.manifest$/); - }); + readdirSync(manifestDir) + .filter((f) => f.endsWith(".manifest")) + .forEach((file) => { + expect(file).toMatch(/^\d{20}\.manifest$/); + }); }); it("should be able to migrate tables to the V2 manifest paths", async () => { @@ -199,16 +203,20 @@ describe("given a connection", () => { const manifestDir = tmpDir.name + "/test_manifest_path_migration.lance/_versions"; - readdirSync(manifestDir).forEach((file) => { - expect(file).toMatch(/^\d\.manifest$/); - }); + readdirSync(manifestDir) + .filter((f) => f.endsWith(".manifest")) + .forEach((file) => { + expect(file).toMatch(/^\d\.manifest$/); + }); await table.migrateManifestPathsV2(); expect(await table.usesV2ManifestPaths()).toBe(true); - readdirSync(manifestDir).forEach((file) => { - expect(file).toMatch(/^\d{20}\.manifest$/); - }); + readdirSync(manifestDir) + .filter((f) => f.endsWith(".manifest")) + .forEach((file) => { + expect(file).toMatch(/^\d{20}\.manifest$/); + }); }); }); diff --git a/python/python/tests/test_db.py b/python/python/tests/test_db.py index d3db372de..9495fb330 100644 --- a/python/python/tests/test_db.py +++ b/python/python/tests/test_db.py @@ -466,7 +466,8 @@ async def test_create_table_v2_manifest_paths_async(tmp_path): assert await tbl.uses_v2_manifest_paths() manifests_dir = tmp_path / 
"test_v2_manifest_paths.lance" / "_versions" for manifest in os.listdir(manifests_dir): - assert re.match(r"\d{20}\.manifest", manifest) + if manifest.endswith(".manifest"): + assert re.match(r"\d{20}\.manifest", manifest) # Start a table in V1 mode then migrate tbl = await db_no_v2_paths.create_table( @@ -476,13 +477,15 @@ async def test_create_table_v2_manifest_paths_async(tmp_path): assert not await tbl.uses_v2_manifest_paths() manifests_dir = tmp_path / "test_v2_migration.lance" / "_versions" for manifest in os.listdir(manifests_dir): - assert re.match(r"\d\.manifest", manifest) + if manifest.endswith(".manifest"): + assert re.match(r"\d\.manifest", manifest) await tbl.migrate_manifest_paths_v2() assert await tbl.uses_v2_manifest_paths() for manifest in os.listdir(manifests_dir): - assert re.match(r"\d{20}\.manifest", manifest) + if manifest.endswith(".manifest"): + assert re.match(r"\d{20}\.manifest", manifest) @pytest.mark.asyncio diff --git a/python/python/tests/test_lsm_write_spec.py b/python/python/tests/test_lsm_write_spec.py index b81153994..d9d75d3a9 100644 --- a/python/python/tests/test_lsm_write_spec.py +++ b/python/python/tests/test_lsm_write_spec.py @@ -40,16 +40,6 @@ def _make_table(tmp_path): def test_set_lsm_write_spec_validates(tmp_path): _db, table = _make_table(tmp_path) - # No PK set yet. - with pytest.raises(Exception, match="primary key"): - table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 4)) - - table.set_unenforced_primary_key("id") - - # Column mismatch. - with pytest.raises(Exception, match="match"): - table.set_lsm_write_spec(LsmWriteSpec.bucket("v", 4)) - # Out-of-range num_buckets. with pytest.raises(Exception, match="num_buckets"): table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 0)) @@ -70,7 +60,6 @@ def test_unset_lsm_write_spec(tmp_path): table.unset_lsm_write_spec() # Install a spec, then remove it; afterwards a fresh spec can be set. 
- table.set_unenforced_primary_key("id") table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 4)) table.unset_lsm_write_spec() # A second unset errors — there is no spec left to remove. diff --git a/rust/lancedb/src/table.rs b/rust/lancedb/src/table.rs index d4e8d3be3..60debbd22 100644 --- a/rust/lancedb/src/table.rs +++ b/rust/lancedb/src/table.rs @@ -312,17 +312,15 @@ pub use self::merge::MergeResult; /// date) and [`LsmWriteSpec::with_writer_config_defaults`] (default /// `ShardWriter` configuration recorded in the MemWAL index). /// -/// All variants require the table to have an unenforced primary key. -/// /// Install a spec with [`Table::set_lsm_write_spec`] and remove it with /// [`Table::unset_lsm_write_spec`]. The actual `merge_insert` dispatch /// onto the MemWAL writer is a follow-up. #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub enum LsmWriteSpec { - /// Hash-bucket sharding by the unenforced primary key column. + /// Hash-bucket sharding by a scalar column. /// - /// `column` must equal the table's currently-set single-column - /// unenforced primary key. `num_buckets` must be in `[1, 1024]`. + /// `column` must be a non-nested column with a supported scalar type. + /// `num_buckets` must be in `[1, 1024]`. /// Iceberg-compatible Murmur3-x86-32 (seed 0) is used so each row's /// `bucket(column, num_buckets)` value is stable across processes. Bucket { @@ -1360,21 +1358,15 @@ impl Table { /// /// [`LsmWriteSpec`] chooses one of three sharding strategies: /// - /// - [`LsmWriteSpec::bucket`] — hash-bucket writes by the single-column - /// unenforced primary key. + /// - [`LsmWriteSpec::bucket`] — hash-bucket writes by a scalar column. /// - [`LsmWriteSpec::identity`] — shard by the raw value of a scalar column. /// - [`LsmWriteSpec::unsharded`] — route every write to a single shard. 
/// - /// All variants require the table to have an unenforced primary key - /// ([`Table::set_unenforced_primary_key`]); bucket sharding additionally - /// requires it to be the single column being bucketed. - /// /// # Example /// /// ``` /// # use lancedb::table::{LsmWriteSpec, Table}; /// # async fn example(table: &Table) -> Result<(), Box<dyn std::error::Error>> { - /// table.set_unenforced_primary_key(["id"]).await?; /// table /// .set_lsm_write_spec( /// LsmWriteSpec::bucket("id", 16).with_maintained_indexes(["id_idx"]), @@ -4661,21 +4653,6 @@ mod tests { .unwrap(); let table = conn.create_table("t", reader).execute().await.unwrap(); - // Reject when no PK is set. - let err = table - .set_lsm_write_spec(LsmWriteSpec::bucket("id", 4)) - .await - .expect_err("should reject without PK"); - assert!(matches!(err, Error::Lance { .. }), "got {:?}", err); - - // Set PK, then a mismatched column on the spec must be rejected. - table.set_unenforced_primary_key(["id"]).await.unwrap(); - let err = table - .set_lsm_write_spec(LsmWriteSpec::bucket("name", 4)) - .await - .expect_err("should reject column != PK"); - assert!(matches!(err, Error::Lance { .. }), "got {:?}", err); - // Reject num_buckets out of range. for bad in [0u32, 1025] { let err = table @@ -4741,9 +4718,6 @@ mod tests { .unwrap(); let table = conn.create_table("t", reader).execute().await.unwrap(); - // Lance's MemWAL still requires *some* unenforced primary key on - // the dataset; Unsharded just skips the per-row hashing step. - table.set_unenforced_primary_key(["id"]).await.unwrap(); table .set_lsm_write_spec(LsmWriteSpec::unsharded()) .await @@ -4790,7 +4764,6 @@ mod tests { .unwrap(); let table = conn.create_table("t", reader).execute().await.unwrap(); - table.set_unenforced_primary_key(["id"]).await.unwrap(); table .set_lsm_write_spec( LsmWriteSpec::identity("region") @@ -4846,7 +4819,6 @@ mod tests { table.unset_lsm_write_spec().await.unwrap_err(); // Install a spec, then unset it. 
- table.set_unenforced_primary_key(["id"]).await.unwrap(); table .set_lsm_write_spec(LsmWriteSpec::bucket("id", 4)) .await