diff --git a/Cargo.lock b/Cargo.lock index 930e1494a..6c6da7cb6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -383,7 +383,7 @@ version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c30a1365d7a7dc50cc847e54154e6af49e4c4b0fddc9f607b687f29212082743" dependencies = [ - "bitflags", + "bitflags 2.11.0", "serde_core", "serde_json", ] @@ -1204,6 +1204,26 @@ dependencies = [ "num-traits", ] +[[package]] +name = "bincode" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36eaf5d7b090263e8150820482d5d93cd964a81e4019913c972f4edcc6edb740" +dependencies = [ + "bincode_derive", + "serde", + "unty", +] + +[[package]] +name = "bincode_derive" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf95709a440f45e986983918d0e8a1f30a9b1df04918fc828670606804ac3c09" +dependencies = [ + "virtue", +] + [[package]] name = "bit-set" version = "0.8.0" @@ -1219,6 +1239,12 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + [[package]] name = "bitflags" version = "2.11.0" @@ -1467,6 +1493,15 @@ dependencies = [ "shlex", ] +[[package]] +name = "cedarwood" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d910bedd62c24733263d0bed247460853c9d22e8956bd4cd964302095e04e90" +dependencies = [ + "smallvec", +] + [[package]] name = "cfg-if" version = "0.1.10" @@ -1539,7 +1574,7 @@ checksum = "433e39f13c9a060046954e0592a8d0a4bcb1040125cbf91cb8ee58964cfb350f" dependencies = [ "parse-zoneinfo", "phf 0.11.3", - "phf_codegen", + "phf_codegen 0.11.3", ] [[package]] @@ -1889,7 +1924,7 @@ version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d8b9f2e4c67f833b660cdb0a3523065869fb35570177239812ed4c905aeff87b" dependencies = [ - "bitflags", + "bitflags 2.11.0", "crossterm_winapi", "document-features", "parking_lot", @@ -2910,6 +2945,70 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" +[[package]] +name = "encoding" +version = "0.2.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec" +dependencies = [ + "encoding-index-japanese", + "encoding-index-korean", + "encoding-index-simpchinese", + "encoding-index-singlebyte", + "encoding-index-tradchinese", +] + +[[package]] +name = "encoding-index-japanese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-korean" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-simpchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-singlebyte" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-tradchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding_index_tests" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" + [[package]] name = "encoding_rs" version = "0.8.35" @@ -2919,6 +3018,15 @@ dependencies = [ "cfg-if 1.0.4", ] +[[package]] +name = "encoding_rs_io" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cc3c5651fb62ab8aa3103998dade57efdd028544bd300516baa31840c252a83" +dependencies = [ + "encoding_rs", +] + [[package]] name = "enum-as-inner" version = "0.6.1" @@ -3074,6 +3182,17 @@ dependencies = [ "subtle", ] +[[package]] +name = "filetime" +version = "0.2.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f98844151eee8917efc50bd9e8318cb963ae8b297431495d3f758616ea5c57db" +dependencies = [ + "cfg-if 1.0.4", + "libc", + "libredox", +] + [[package]] name = "find-msvc-tools" version = "0.1.9" @@ -3092,7 +3211,7 @@ version = "25.12.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "35f6839d7b3b98adde531effaf34f0c2badc6f4735d26fe74709d8e513a96ef3" dependencies = [ - "bitflags", + "bitflags 2.11.0", "rustc_version", ] @@ -4093,7 +4212,7 @@ version = "0.7.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fdd7bddefd0a8833b88a4b68f90dae22c7450d11b354198baee3874fd811b344" dependencies = [ - "bitflags", + "bitflags 2.11.0", "cfg-if 1.0.4", "libc", ] @@ -4168,6 +4287,28 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9028f49264629065d057f340a86acb84867925865f73bbf8d47b4d149a7e88b8" +[[package]] +name = "jieba-macros" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a29cfc5dcd898604c6f80363411fa6b6b08e27d1d253d6225b9cb6702ea02fc0" +dependencies = [ + "phf_codegen 0.13.1", +] + +[[package]] +name = "jieba-rs" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3245d6e9d1d5facbd6a23848d6b67e3439738ccbb4fa5a3d65da315ba1a910a2" +dependencies = [ + "cedarwood", + "jieba-macros", + "phf 0.13.1", + "regex", + "rustc-hash", +] + [[package]] name = "jiff" version = "0.2.23" @@ -4317,6 +4458,15 @@ dependencies = [ "simple_asn1", ] +[[package]] +name = "kanaria" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0f9d9652540055ac4fded998a73aca97d965899077ab1212587437da44196ff" +dependencies = [ + "bitflags 1.3.2", +] + [[package]] name = "konst" version = "0.4.3" @@ -4555,7 +4705,7 @@ dependencies = [ "prost-types", "rand 0.9.4", "snafu 0.9.0", - "strum", + "strum 0.26.3", "tokio", "tracing", "xxhash-rust", @@ -4626,6 +4776,7 @@ dependencies = [ "futures", "half", "itertools 0.13.0", + "jieba-rs", "jsonb", "lance-arrow", "lance-core", @@ -4845,6 +4996,8 @@ name = "lance-tokenizer" version = "7.0.0-beta.7" source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.7#f6932459689b5568c89baa435ff85a4abf067b45" dependencies = [ + "jieba-rs", + "lindera", "rust-stemmers", "serde", "unicode-normalization", @@ -5080,7 +5233,134 @@ version = "0.1.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7ddbf48fd451246b1f8c2610bd3b4ac0cc6e149d89832867093ab69a17194f08" dependencies = [ + "bitflags 2.11.0", "libc", + "plain", + "redox_syscall 0.7.4", +] + +[[package]] +name = "lindera" +version = "0.44.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50aba4ef41052280722f2120f65606b9218e8718032a3c752b953c4d8091f02e" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "csv", + "kanaria", + "lindera-cc-cedict", + "lindera-dictionary", + "lindera-ipadic", + "lindera-ipadic-neologd", + "lindera-ko-dic", + "lindera-unidic", + "once_cell", + "regex", + "serde", + "serde_json", + "serde_yaml", + "strum 0.27.2", + "strum_macros 0.27.2", + "unicode-blocks", + "unicode-normalization", + "unicode-segmentation", + "yada", +] + +[[package]] +name = "lindera-cc-cedict" +version = "0.44.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d77e7a0830fd60f23828ad914439997288c1d2cdd9e269be67f967c27b56350" +dependencies = [ + "bincode", + "byteorder", + "lindera-dictionary", + "once_cell", + "tokio", +] + +[[package]] +name = "lindera-dictionary" +version = "0.44.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "489cc70922782af3fd397c0e130846caefe1c15b27c2211aac8f88a9f4590aaf" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "csv", + "derive_builder", + "encoding", + "encoding_rs", + "encoding_rs_io", + "flate2", + "glob", + "log", + "md5", + "memmap2 0.9.10", + "once_cell", + "rand 0.9.4", + "reqwest 0.12.28", + "serde", + "tar", + "thiserror 2.0.18", + "tokio", + "yada", +] + +[[package]] +name = "lindera-ipadic" +version = "0.44.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78870521431dfaf0f94ddd3484fa08367e9d354fc8c708572f2f00007225ddfa" +dependencies = [ + "bincode", + "byteorder", + "lindera-dictionary", + "once_cell", + "tokio", +] + +[[package]] +name = "lindera-ipadic-neologd" +version = "0.44.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "abcb3dc3056e5c683e12c2c5e8d40076f7ecfd7bd46f5fc0e4ae9e58152b5d85" +dependencies = [ + "bincode", + "byteorder", + "lindera-dictionary", + "once_cell", + "tokio", +] + +[[package]] +name = "lindera-ko-dic" +version = "0.44.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e99316158bab14f0256d912055521ca784f76c63e7460db8a74775c5dc1f8bc2" +dependencies = [ + "bincode", + "byteorder", + "lindera-dictionary", + "once_cell", + "tokio", +] + +[[package]] +name = "lindera-unidic" +version = "0.44.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52672945166c14276bbba25e4ec79d7e126db1b503c0a6aa07ffc0141ae15cfa" +dependencies = [ + "bincode", + "byteorder", + "lindera-dictionary", + "once_cell", + "tokio", ] [[package]] @@ -5237,6 +5517,12 @@ dependencies = [ "digest", ] +[[package]] +name = "md5" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae960838283323069879657ca3de837e9f7bbb4c7bf6ea7f1b290d5e9476d2e0" + [[package]] name = "mea" version = "0.6.3" @@ -5402,7 +5688,7 @@ version = "3.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6944d0bf100571cd6e1a98a316cdca262deb6fccf8d93f5ae1502ca3fc88bd3" dependencies = [ - "bitflags", + "bitflags 2.11.0", "ctor", "futures", "napi-build", @@ -5615,7 +5901,7 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2a180dd8642fa45cdb7dd721cd4c11b1cadd4929ce112ebd8b9f5803cc79d536" dependencies = [ - "bitflags", + "bitflags 2.11.0", ] [[package]] @@ -5727,7 +6013,7 @@ version = "6.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "336b9c63443aceef14bea841b899035ae3abe89b7c486aaf4c5bd8aafedac3f0" dependencies = [ - "bitflags", + "bitflags 2.11.0", "libc", "once_cell", "onig_sys", @@ -6041,7 +6327,7 @@ checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" dependencies = [ "cfg-if 1.0.4", "libc", - "redox_syscall", + "redox_syscall 0.5.18", "smallvec", "windows-link", ] @@ -6150,16 +6436,36 @@ dependencies = [ "phf_shared 0.12.1", ] +[[package]] +name = "phf" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf" +dependencies = [ + "phf_shared 0.13.1", + "serde", +] + [[package]] name = "phf_codegen" version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" dependencies = [ - "phf_generator", + "phf_generator 0.11.3", "phf_shared 0.11.3", ] +[[package]] +name = "phf_codegen" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49aa7f9d80421bca176ca8dbfebe668cc7a2684708594ec9f3c0db0805d5d6e1" +dependencies = [ + "phf_generator 0.13.1", + "phf_shared 0.13.1", +] + [[package]] name = "phf_generator" version = "0.11.3" @@ -6170,6 +6476,16 @@ dependencies = [ "rand 0.8.5", ] +[[package]] +name = "phf_generator" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737" +dependencies = [ + "fastrand", + "phf_shared 0.13.1", +] + [[package]] name = "phf_shared" version = "0.11.3" @@ -6188,6 +6504,15 @@ dependencies = [ "siphasher", ] +[[package]] +name = "phf_shared" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e57fef6bc5981e38c2ce2d63bfa546861309f875b8a75f092d1d54ae2d64f266" +dependencies = [ + "siphasher", +] + [[package]] name = "pin-project" version = "1.1.11" @@ -6274,6 +6599,12 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" +[[package]] +name = "plain" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6" + [[package]] name = "planus" version = "0.3.1" @@ -6371,7 +6702,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "465f70d3e96b6d0b1a43c358ba451286b8c8bd56696feff020d65702aa33e35c" dependencies = [ "ahash", - "bitflags", + "bitflags 2.11.0", "bytemuck", "chrono", "chrono-tz 0.8.6", @@ -6445,7 +6776,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "89b2632b1af668e2058d5f8f916d8fbde3cac63d03ae29a705f598e41dcfeb7f" dependencies = [ "ahash", - "bitflags", + "bitflags 2.11.0", "glob", "once_cell", "polars-arrow", @@ -7114,7 +7445,7 @@ version = "11.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "498cd0dc59d73224351ee52a95fee0f1a617a2eae0e7d9d720cc622c73a54186" dependencies = [ - "bitflags", + "bitflags 2.11.0", ] [[package]] @@ -7195,7 +7526,16 @@ version = "0.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" dependencies = [ - "bitflags", + "bitflags 2.11.0", +] + +[[package]] +name = "redox_syscall" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f450ad9c3b1da563fb6948a8e0fb0fb9269711c9c73d9ea1de5058c79c8d643a" +dependencies = [ + "bitflags 2.11.0", ] [[package]] @@ -7626,7 +7966,7 @@ version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" dependencies = [ - "bitflags", + "bitflags 2.11.0", "errno", "libc", "linux-raw-sys", @@ -7875,7 +8215,7 @@ version = "3.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d" dependencies = [ - "bitflags", + "bitflags 2.11.0", "core-foundation 0.10.1", "core-foundation-sys", "libc", @@ -8021,6 +8361,19 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "serde_yaml" +version = "0.9.34+deprecated" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" +dependencies = [ + "indexmap 2.13.0", + "itoa", + "ryu", + "serde", + "unsafe-libyaml", +] + [[package]] name = "sha1" version = "0.10.6" @@ -8402,6 +8755,15 @@ dependencies = [ "strum_macros 0.26.4", ] +[[package]] +name = "strum" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf" +dependencies = [ + "strum_macros 0.27.2", +] + [[package]] name = "strum_macros" version = "0.25.3" @@ -8428,6 +8790,18 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "strum_macros" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7695ce3845ea4b33927c055a39dc438a45b059f7c1b3d91d38d10355fb8cbca7" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "subtle" version = "2.6.1" @@ -8488,7 +8862,7 @@ version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "01198a2debb237c62b6826ec7081082d951f46dbb64b0e8c7649a452230d1dfc" dependencies = [ - "bitflags", + "bitflags 2.11.0", "byteorder", "enum-as-inner", "libc", @@ -8530,7 +8904,7 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a13f3d0daba03132c0aa9767f98351b3488edc2c100cda2d2ec2b04f3d8d3c8b" dependencies = [ - "bitflags", + "bitflags 2.11.0", "core-foundation 0.9.4", "system-configuration-sys", ] @@ -8557,6 +8931,17 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" +[[package]] +name = "tar" +version = "0.4.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22692a6476a21fa75fdfc11d452fda482af402c008cdbaf3476414e122040973" +dependencies = [ + "filetime", + "libc", + "xattr", +] + [[package]] name = "target-features" version = "0.1.6" @@ -8895,7 +9280,7 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e9cd434a998747dd2c4276bc96ee2e0c7a2eadf3cae88e52be55a05fa9053f5" dependencies = [ - "bitflags", + "bitflags 2.11.0", "bytes", "http 1.4.0", "http-body 1.0.1", @@ -8913,7 +9298,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8" dependencies = [ "async-compression", - "bitflags", + "bitflags 2.11.0", "bytes", "futures-core", "futures-util", @@ -9068,6 +9453,12 @@ version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142" +[[package]] +name = "unicode-blocks" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b12e05d9e06373163a9bb6bb8c263c261b396643a99445fe6b9811fd376581b" + [[package]] name = "unicode-ident" version = "1.0.24" @@ -9125,6 +9516,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" +[[package]] +name = "unsafe-libyaml" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861" + [[package]] name = "untrusted" version = "0.7.1" @@ -9137,6 +9534,12 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" +[[package]] +name = "unty" +version = "0.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d49784317cd0d1ee7ec5c716dd598ec5b4483ea832a2dced265471cc0f690ae" + [[package]] name = "ureq" version = "2.12.1" @@ -9216,6 +9619,12 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "virtue" +version = "0.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "051eb1abcf10076295e815102942cc58f9d5e3b4560e46e53c21e8ff6f3af7b1" + [[package]] name = "vsimd" version = "0.8.0" @@ -9396,7 +9805,7 @@ version = "0.244.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" dependencies = [ - "bitflags", + "bitflags 2.11.0", "hashbrown 0.15.5", "indexmap 2.13.0", "semver", @@ -9865,7 +10274,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" dependencies = [ "anyhow", - "bitflags", + "bitflags 2.11.0", "indexmap 2.13.0", "log", "serde", @@ -9910,6 +10319,16 @@ dependencies = [ "tap", ] +[[package]] +name = "xattr" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32e45ad4206f6d2479085147f02bc2ef834ac85886624a23575ae137c8aa8156" +dependencies = [ + "libc", + "rustix", +] + [[package]] name = "xet-client" version = "1.5.2" @@ -10069,6 +10488,12 @@ version = "0.8.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" +[[package]] +name = "yada" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aed111bd9e48a802518765906cbdadf0b45afb72b9c81ab049a3b86252adffdd" + [[package]] name = "yoke" version = "0.8.1" diff --git a/deny.toml b/deny.toml index 85231f920..cbecdb902 100644 --- a/deny.toml +++ b/deny.toml @@ -51,6 +51,18 @@ ignore = [ # https://rustsec.org/advisories/RUSTSEC-2024-0436 { id = "RUSTSEC-2024-0436", reason = "transitive via datafusion; awaiting ecosystem migration" }, + # encoding: unmaintained. Reached through lindera-dictionary, which is + # required by the native Lindera tokenizer path. Lindera has not migrated + # off this crate yet. + # https://rustsec.org/advisories/RUSTSEC-2021-0153 + { id = "RUSTSEC-2021-0153", reason = "transitive via lindera-dictionary for native Lindera tokenizer" }, + + # fast-float: unsound and unmaintained. Reached only through polars-arrow + # from the optional Polars integration; replacement requires a Polars + # dependency upgrade. + # https://rustsec.org/advisories/RUSTSEC-2024-0379 + { id = "RUSTSEC-2024-0379", reason = "transitive via polars-arrow; waiting on Polars migration" }, + # tantivy: segfault on malformed input due to missing bounds check. # Pulled in via lance for full-text search. We only feed tantivy # documents we construct ourselves, not attacker-controlled bytes. @@ -68,11 +80,17 @@ ignore = [ # https://rustsec.org/advisories/RUSTSEC-2025-0119 { id = "RUSTSEC-2025-0119", reason = "transitive via hf-hub/indicatif; cosmetic formatting crate" }, - # rustls-pemfile: unmaintained. Reached from two separate chains: - # rustls-native-certs 0.6 (via hyper-rustls 0.24) and object_store 0.12. - # Both upstream dependencies need to move before we can drop it. - # https://rustsec.org/advisories/RUSTSEC-2025-0134 - { id = "RUSTSEC-2025-0134", reason = "transitive via rustls-native-certs/object_store; waiting on upstream migration" }, + # bincode: unmaintained. Reached through lindera and lindera-dictionary, + # which are required by the native Lindera tokenizer path. Lindera has not + # migrated to another serialization format yet. + # https://rustsec.org/advisories/RUSTSEC-2025-0141 + { id = "RUSTSEC-2025-0141", reason = "transitive via lindera/lindera-dictionary for native Lindera tokenizer" }, + + # lru: soundness issue in IterMut. Reached only through aws-sdk-s3 in + # LanceDB's dev-dependency graph; LanceDB does not use that iterator + # directly. Clearing this requires the AWS SDK chain to update lru. + # https://rustsec.org/advisories/RUSTSEC-2026-0002 + { id = "RUSTSEC-2026-0002", reason = "transitive via aws-sdk-s3 dev-dependency; waiting on AWS SDK lru upgrade" }, # rustls-webpki 0.101.7 (old major line): name-constraint checks for # URI / wildcard names. Pulled in only via the legacy rustls 0.21 chain @@ -89,6 +107,12 @@ ignore = [ # we actively use is upgraded to 0.103.13 which contains the fix. # https://rustsec.org/advisories/RUSTSEC-2026-0104 { id = "RUSTSEC-2026-0104", reason = "only affects rustls-webpki 0.101 from legacy aws-smithy/rustls 0.21 chain" }, + + # rand 0.8.5: soundness issue only when ThreadRng reseeds inside a custom + # logger. Reached through several transitive chains. LanceDB does not use + # rand from a custom logger; upgrade once all pinned chains accept 0.8.6+. + # https://rustsec.org/advisories/RUSTSEC-2026-0097 + { id = "RUSTSEC-2026-0097", reason = "transitive rand 0.8.5; LanceDB does not call ThreadRng from custom logging" }, ] # --------------------------------------------------------------------------- diff --git a/docs/src/python/python.md b/docs/src/python/python.md index 5f4236c83..6dca95ed8 100644 --- a/docs/src/python/python.md +++ b/docs/src/python/python.md @@ -94,11 +94,11 @@ of raw SQL strings with [where][lancedb.query.LanceQueryBuilder.where] and ## Full text search -::: lancedb.fts.create_index +Use [lancedb.table.Table.create_fts_index][] for the synchronous API or +[lancedb.table.AsyncTable.create_index][] with [lancedb.index.FTS][] for the +asynchronous API. -::: lancedb.fts.populate_index - -::: lancedb.fts.search_index +::: lancedb.index.FTS ## Utilities diff --git a/python/python/lancedb/index.py b/python/python/lancedb/index.py index d5e4bfcdc..4fbffc50d 100644 --- a/python/python/lancedb/index.py +++ b/python/python/lancedb/index.py @@ -7,6 +7,7 @@ from typing import Literal, Optional from ._lancedb import ( IndexConfig, ) +from .types import BaseTokenizerType lang_mapping = { "ar": "Arabic", @@ -111,8 +112,12 @@ class FTS: - "simple": Splits text by whitespace and punctuation. - "whitespace": Split text by whitespace, but not punctuation. - "raw": No tokenization. The entire text is treated as a single token. + - "ngram": N-gram tokenizer for substring-style matching. + - "jieba/*": Jieba tokenizer loaded from Lance's language model home. + - "lindera/*": Lindera tokenizer loaded from Lance's language model home. language : str, default "English" - The language to use for tokenization. + The language to use for stemming and stop-word removal. This is not the + primary way to enable CJK tokenization. max_token_length : int, default 40 The maximum token length to index. Tokens longer than this length will be ignored. @@ -127,10 +132,17 @@ class FTS: ascii_folding : bool, default True Whether to fold ASCII characters. This converts accented characters to their ASCII equivalent. For example, "café" would be converted to "cafe". + + Notes + ----- + Model-backed tokenizers such as ``jieba/default`` and ``lindera/ipadic`` + require tokenizer models in Lance's language model home. Set + ``LANCE_LANGUAGE_MODEL_HOME`` to override the default platform data + directory under ``lance/language_models``. """ with_position: bool = False - base_tokenizer: Literal["simple", "raw", "whitespace"] = "simple" + base_tokenizer: BaseTokenizerType = "simple" language: str = "English" max_token_length: Optional[int] = 40 lower_case: bool = True diff --git a/python/python/lancedb/remote/table.py b/python/python/lancedb/remote/table.py index 905e1481a..a33166937 100644 --- a/python/python/lancedb/remote/table.py +++ b/python/python/lancedb/remote/table.py @@ -39,6 +39,7 @@ from lancedb.table import _normalize_progress from ..query import LanceVectorQueryBuilder, LanceQueryBuilder, LanceTakeQueryBuilder from ..table import AsyncTable, IndexStatistics, Query, Table, Tags +from ..types import BaseTokenizerType class RemoteTable(Table): @@ -167,7 +168,7 @@ class RemoteTable(Table): wait_timeout: Optional[timedelta] = None, with_position: bool = False, # tokenizer configs: - base_tokenizer: str = "simple", + base_tokenizer: BaseTokenizerType = "simple", language: str = "English", max_token_length: Optional[int] = 40, lower_case: bool = True, diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py index d1321b69a..82768197c 100644 --- a/python/python/lancedb/table.py +++ b/python/python/lancedb/table.py @@ -86,6 +86,59 @@ from .util import ( ) from .index import lang_mapping +_MODEL_BACKED_TOKENIZER_PREFIXES = ("jieba", "lindera") +_MODEL_BACKED_TOKENIZER_ERRORS = ( + "unknown base tokenizer", + "Invalid directory path:", + "Failed to load Jieba", + "Failed to load tokenizer config", + "Failed to initialize default tokenizer", +) + + +def _add_unique_note(exception: BaseException, note: str) -> None: + existing_notes = getattr(exception, "__notes__", ()) or () + message = ( + exception.args[0] + if exception.args and isinstance(exception.args[0], str) + else "" + ) + if note not in existing_notes and note not in message: + add_note(exception, note) + + +def _is_model_backed_tokenizer(base_tokenizer: str) -> bool: + return any( + base_tokenizer == prefix or base_tokenizer.startswith(f"{prefix}/") + for prefix in _MODEL_BACKED_TOKENIZER_PREFIXES + ) + + +def _maybe_add_fts_error_note( + exception: BaseException, *, base_tokenizer: str, language: Optional[str] = None +) -> None: + message = str(exception) + if language is not None and "not support the requested language" in message: + supported_langs = ", ".join(lang_mapping.values()) + _add_unique_note(exception, f"Supported languages: {supported_langs}") + return + + if not _is_model_backed_tokenizer(base_tokenizer): + return + + if not any(marker in message for marker in _MODEL_BACKED_TOKENIZER_ERRORS): + return + + _add_unique_note( + exception, + "Model-backed tokenizers such as 'jieba/default' and 'lindera/ipadic' " + "require tokenizer models in Lance's language model home. Set " + "LANCE_LANGUAGE_MODEL_HOME to override the default platform data " + "directory under 'lance/language_models'. Expected layouts include " + "'/jieba/default/...' and " + "'/lindera/ipadic/...'.", + ) + if TYPE_CHECKING: from .db import LanceDBConnection @@ -958,7 +1011,10 @@ class Table(ABC): tokenizer_name: str, default "default" A compatibility alias for native tokenizer configs. Can be "raw", "default" or the 2 letter language code followed by "_stem". So - for english it would be "en_stem". + for english it would be "en_stem". For new native FTS indexes, use + ``base_tokenizer`` directly; ``tokenizer_name`` is a legacy + compatibility alias and does not expose model-backed tokenizer names + such as ``jieba/default`` or ``lindera/ipadic``. use_tantivy: bool, default False Deprecated legacy Tantivy parameter. Setting this to True raises an error. @@ -972,8 +1028,11 @@ class Table(ABC): - "whitespace": Split text by whitespace, but not punctuation. - "raw": No tokenization. The entire text is treated as a single token. - "ngram": N-Gram tokenizer. + - "jieba/*": Jieba tokenizer loaded from Lance's language model home. + - "lindera/*": Lindera tokenizer loaded from Lance's language model home. language : str, default "English" - The language to use for tokenization. + The language to use for stemming and stop-word removal. This is not + the primary way to enable CJK tokenization. max_token_length : int, default 40 The maximum token length to index. Tokens longer than this length will be ignored. @@ -999,6 +1058,13 @@ class Table(ABC): The timeout to wait if indexing is asynchronous. name: str, optional The name of the index. If not provided, a default name will be generated. + + Notes + ----- + Model-backed tokenizers such as ``jieba/default`` and ``lindera/ipadic`` + require tokenizer models in Lance's language model home. Set + ``LANCE_LANGUAGE_MODEL_HOME`` to override the default platform data + directory under ``lance/language_models``. """ raise NotImplementedError @@ -2462,14 +2528,22 @@ class LanceTable(Table): **tokenizer_configs, ) - LOOP.run( - self._table.create_index( - field_names, - replace=replace, - config=config, - name=name, + try: + LOOP.run( + self._table.create_index( + field_names, + replace=replace, + config=config, + name=name, + ) ) - ) + except (ValueError, RuntimeError) as e: + _maybe_add_fts_error_note( + e, + base_tokenizer=config.base_tokenizer, + language=config.language, + ) + raise e @staticmethod def infer_tokenizer_configs(tokenizer_name: str) -> dict: @@ -3865,11 +3939,13 @@ class AsyncTable: name=name, train=train, ) - except ValueError as e: - if "not support the requested language" in str(e): - supported_langs = ", ".join(lang_mapping.values()) - help_msg = f"Supported languages: {supported_langs}" - add_note(e, help_msg) + except (ValueError, RuntimeError) as e: + if isinstance(config, FTS): + _maybe_add_fts_error_note( + e, + base_tokenizer=config.base_tokenizer, + language=config.language, + ) raise e async def drop_index(self, name: str) -> None: diff --git a/python/python/lancedb/types.py b/python/python/lancedb/types.py index e7b185f2b..2e26e5630 100644 --- a/python/python/lancedb/types.py +++ b/python/python/lancedb/types.py @@ -40,4 +40,5 @@ IndexType = Literal[ ] # Tokenizer literals -BaseTokenizerType = Literal["simple", "raw", "whitespace", "ngram"] +BuiltinTokenizerType = Literal["simple", "raw", "whitespace", "ngram"] +BaseTokenizerType = BuiltinTokenizerType | str diff --git a/python/python/tests/models/jieba/default/dict.txt b/python/python/tests/models/jieba/default/dict.txt new file mode 100644 index 000000000..237b47ca6 --- /dev/null +++ b/python/python/tests/models/jieba/default/dict.txt @@ -0,0 +1,8 @@ +我们 98740 r +都 202780 d +有 423765 v +光明 1219 n +的 318825 uj +前途 1263 n +前 62779 f +途 857 n diff --git a/python/python/tests/models/lindera/ipadic/config.yml b/python/python/tests/models/lindera/ipadic/config.yml new file mode 100644 index 000000000..6b3f0af40 --- /dev/null +++ b/python/python/tests/models/lindera/ipadic/config.yml @@ -0,0 +1,4 @@ +segmenter: + mode: "normal" + dictionary: + path: "./python/tests/models/lindera/ipadic/main" \ No newline at end of file diff --git a/python/python/tests/models/lindera/ipadic/main.zip b/python/python/tests/models/lindera/ipadic/main.zip new file mode 100644 index 000000000..4db1bab93 Binary files /dev/null and b/python/python/tests/models/lindera/ipadic/main.zip differ diff --git a/python/python/tests/test_fts.py b/python/python/tests/test_fts.py index 57f2db85d..5fa64c14e 100644 --- a/python/python/tests/test_fts.py +++ b/python/python/tests/test_fts.py @@ -15,7 +15,10 @@ # limitations under the License. import os import random +import shutil from unittest import mock +from pathlib import Path +import zipfile import lancedb as ldb from lancedb.db import DBConnection @@ -36,6 +39,8 @@ import pytest import pytest_asyncio from utils import exception_output +TEST_LANGUAGE_MODEL_HOME = Path(__file__).parent / "models" + @pytest.fixture def table(tmp_path) -> ldb.table.LanceTable: @@ -89,6 +94,40 @@ def table(tmp_path) -> ldb.table.LanceTable: return table +@pytest.fixture +def language_model_home(monkeypatch, tmp_path): + model_home = tmp_path / "language-models" + shutil.copytree(TEST_LANGUAGE_MODEL_HOME, model_home) + monkeypatch.setenv("LANCE_LANGUAGE_MODEL_HOME", str(model_home)) + return model_home + + +@pytest.fixture +def lindera_ipadic(language_model_home): + model_path = language_model_home / "lindera" / "ipadic" + extracted_model = model_path / "main" + config_path = model_path / "config.yml" + + if extracted_model.exists(): + shutil.rmtree(extracted_model) + + with zipfile.ZipFile(model_path / "main.zip", "r") as zip_ref: + zip_ref.extractall(model_path) + config_path.write_text( + "segmenter:\n" + ' mode: "normal"\n' + " dictionary:\n" + f' path: "{extracted_model.resolve().as_posix()}"\n', + encoding="utf-8", + ) + + try: + yield + finally: + if extracted_model.exists(): + shutil.rmtree(extracted_model) + + @pytest_asyncio.fixture async def async_table(tmp_path) -> ldb.table.AsyncTable: # Use local random state to avoid affecting other tests @@ -684,6 +723,90 @@ def test_fts_ngram(mem_db: DBConnection): assert set(r["text"] for r in results) == {"lance database", "lance is cool"} +def test_fts_jieba_tokenizer(mem_db: DBConnection, language_model_home): + data = pa.table({"text": ["我们都有光明的前途", "光明的前途"]}) + table = mem_db.create_table("test_jieba", data=data) + table.create_fts_index( + "text", + base_tokenizer="jieba/default", + stem=False, + remove_stop_words=False, + ascii_folding=False, + ) + + results = table.search("我们", query_type="fts").limit(10).to_list() + assert [row["text"] for row in results] == ["我们都有光明的前途"] + + +def test_fts_jieba_missing_language_model_note( + mem_db: DBConnection, monkeypatch, tmp_path +): + missing_root = tmp_path / "missing-language-models" + monkeypatch.setenv("LANCE_LANGUAGE_MODEL_HOME", str(missing_root)) + table = mem_db.create_table( + "test_missing_jieba_model", + data=pa.table({"text": ["我们都有光明的前途"]}), + ) + + with pytest.raises((ValueError, RuntimeError)) as e: + table.create_fts_index( + "text", + base_tokenizer="jieba/default", + stem=False, + remove_stop_words=False, + ascii_folding=False, + ) + + output = exception_output(e) + assert "Invalid directory path:" in output + assert "LANCE_LANGUAGE_MODEL_HOME" in output + assert "jieba/default" in output + + +@pytest.mark.asyncio +async def test_fts_jieba_missing_language_model_note_async(monkeypatch, tmp_path): + missing_root = tmp_path / "missing-language-models" + monkeypatch.setenv("LANCE_LANGUAGE_MODEL_HOME", str(missing_root)) + db = await ldb.connect_async(tmp_path / "async-db") + table = await db.create_table( + "test_missing_jieba_model_async", + data=pa.table({"text": ["我们都有光明的前途"]}), + ) + + with pytest.raises((ValueError, RuntimeError)) as e: + await table.create_index( + "text", + config=FTS( + base_tokenizer="jieba/default", + stem=False, + remove_stop_words=False, + ascii_folding=False, + ), + ) + + output = exception_output(e) + assert "Invalid directory path:" in output + assert "LANCE_LANGUAGE_MODEL_HOME" in output + assert "jieba/default" in output + + +def test_fts_lindera_tokenizer( + mem_db: DBConnection, language_model_home, lindera_ipadic +): + data = pa.table({"text": ["成田国際空港", "東京国際空港", "羽田空港"]}) + table = mem_db.create_table("test_lindera", data=data) + table.create_fts_index( + "text", + base_tokenizer="lindera/ipadic", + stem=False, + remove_stop_words=False, + ascii_folding=False, + ) + + results = table.search("成田", query_type="fts").limit(10).to_list() + assert [row["text"] for row in results] == ["成田国際空港"] + + def test_fts_query_to_json(): """Test that FTS query to_json() produces valid JSON strings with exact format.""" diff --git a/rust/lancedb/Cargo.toml b/rust/lancedb/Cargo.toml index 292226ede..b05302b9b 100644 --- a/rust/lancedb/Cargo.toml +++ b/rust/lancedb/Cargo.toml @@ -40,7 +40,7 @@ lance-datafusion.workspace = true lance-datagen = { workspace = true } lance-file = { workspace = true } lance-io = { workspace = true } -lance-index = { workspace = true } +lance-index = { workspace = true, features = ["tokenizer-jieba", "tokenizer-lindera"] } lance-table = { workspace = true } lance-linalg = { workspace = true } lance-testing = { workspace = true }