From f54842ccafe5b7ff59b35e2a80a0db441b3a4ad6 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Mon, 20 Apr 2026 10:03:16 +0800 Subject: [PATCH] feat(python): support model-backed native FTS tokenizers --- Cargo.lock | 469 +++++++++++++++++- docs/src/python/python.md | 8 +- python/python/lancedb/index.py | 14 +- python/python/lancedb/remote/table.py | 3 +- python/python/lancedb/table.py | 93 +++- python/python/lancedb/types.py | 3 +- .../tests/models/jieba/default/dict.txt | 8 + .../tests/models/lindera/ipadic/config.yml | 4 + .../tests/models/lindera/ipadic/main.zip | Bin 0 -> 2450 bytes python/python/tests/test_fts.py | 113 +++++ rust/lancedb/Cargo.toml | 2 +- 11 files changed, 672 insertions(+), 45 deletions(-) create mode 100644 python/python/tests/models/jieba/default/dict.txt create mode 100644 python/python/tests/models/lindera/ipadic/config.yml create mode 100644 python/python/tests/models/lindera/ipadic/main.zip diff --git a/Cargo.lock b/Cargo.lock index a749b0208..ee87b9f25 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -365,7 +365,7 @@ version = "57.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8c872d36b7bf2a6a6a2b40de9156265f0242910791db366a2c17476ba8330d68" dependencies = [ - "bitflags", + "bitflags 2.11.0", "serde_core", "serde_json", ] @@ -1186,6 +1186,26 @@ dependencies = [ "num-traits", ] +[[package]] +name = "bincode" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36eaf5d7b090263e8150820482d5d93cd964a81e4019913c972f4edcc6edb740" +dependencies = [ + "bincode_derive", + "serde", + "unty", +] + +[[package]] +name = "bincode_derive" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf95709a440f45e986983918d0e8a1f30a9b1df04918fc828670606804ac3c09" +dependencies = [ + "virtue", +] + [[package]] name = "bit-set" version = "0.8.0" @@ -1201,6 +1221,12 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + [[package]] name = "bitflags" version = "2.11.0" @@ -1438,6 +1464,15 @@ dependencies = [ "shlex", ] +[[package]] +name = "cedarwood" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d910bedd62c24733263d0bed247460853c9d22e8956bd4cd964302095e04e90" +dependencies = [ + "smallvec", +] + [[package]] name = "cfg-if" version = "1.0.4" @@ -1493,7 +1528,7 @@ checksum = "433e39f13c9a060046954e0592a8d0a4bcb1040125cbf91cb8ee58964cfb350f" dependencies = [ "parse-zoneinfo", "phf 0.11.3", - "phf_codegen", + "phf_codegen 0.11.3", ] [[package]] @@ -1751,7 +1786,7 @@ version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d8b9f2e4c67f833b660cdb0a3523065869fb35570177239812ed4c905aeff87b" dependencies = [ - "bitflags", + "bitflags 2.11.0", "crossterm_winapi", "document-features", "parking_lot", @@ -2761,6 +2796,70 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" +[[package]] +name = "encoding" +version = "0.2.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec" +dependencies = [ + "encoding-index-japanese", + "encoding-index-korean", + "encoding-index-simpchinese", + "encoding-index-singlebyte", + "encoding-index-tradchinese", +] + +[[package]] +name = "encoding-index-japanese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-korean" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-simpchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-singlebyte" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-tradchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding_index_tests" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" + [[package]] name = "encoding_rs" version = "0.8.35" @@ -2770,6 +2869,15 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "encoding_rs_io" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cc3c5651fb62ab8aa3103998dade57efdd028544bd300516baa31840c252a83" +dependencies = [ + "encoding_rs", +] + [[package]] name = "enum-as-inner" version = "0.6.1" @@ -2925,6 +3033,17 @@ dependencies = [ "subtle", ] +[[package]] +name = "filetime" +version = "0.2.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f98844151eee8917efc50bd9e8318cb963ae8b297431495d3f758616ea5c57db" +dependencies = [ + "cfg-if", + "libc", + "libredox", +] + [[package]] name = "find-msvc-tools" version = "0.1.9" @@ -2943,7 +3062,7 @@ version = "25.12.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "35f6839d7b3b98adde531effaf34f0c2badc6f4735d26fe74709d8e513a96ef3" dependencies = [ - "bitflags", + "bitflags 2.11.0", "rustc_version", ] @@ -3893,7 +4012,7 @@ version = "0.7.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fdd7bddefd0a8833b88a4b68f90dae22c7450d11b354198baee3874fd811b344" dependencies = [ - "bitflags", + "bitflags 2.11.0", "cfg-if", "libc", ] @@ -3968,6 +4087,28 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9028f49264629065d057f340a86acb84867925865f73bbf8d47b4d149a7e88b8" +[[package]] +name = "jieba-macros" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a29cfc5dcd898604c6f80363411fa6b6b08e27d1d253d6225b9cb6702ea02fc0" +dependencies = [ + "phf_codegen 0.13.1", +] + +[[package]] +name = "jieba-rs" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3245d6e9d1d5facbd6a23848d6b67e3439738ccbb4fa5a3d65da315ba1a910a2" +dependencies = [ + "cedarwood", + "jieba-macros", + "phf 0.13.1", + "regex", + "rustc-hash", +] + [[package]] name = "jiff" version = "0.2.23" @@ -4064,6 +4205,15 @@ dependencies = [ "simple_asn1", ] +[[package]] +name = "kanaria" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0f9d9652540055ac4fded998a73aca97d965899077ab1212587437da44196ff" +dependencies = [ + "bitflags 1.3.2", +] + [[package]] name = "lance" version = "6.0.0-beta.1" @@ -4285,7 +4435,7 @@ dependencies = [ "prost-types", "rand 0.9.2", "snafu 0.9.0", - "strum", + "strum 0.26.3", "tokio", "tracing", "xxhash-rust", @@ -4355,6 +4505,7 @@ dependencies = [ "futures", "half", "itertools 0.13.0", + "jieba-rs", "jsonb", "lance-arrow", "lance-core", @@ -4569,6 +4720,8 @@ name = "lance-tokenizer" version = "6.0.0-beta.1" source = "git+https://github.com/lance-format/lance.git?tag=v6.0.0-beta.1#c7a7d3a0e944646e793d297d4a2e2cf7e4fb28a3" dependencies = [ + "jieba-rs", + "lindera", "rust-stemmers", "serde", "unicode-normalization", @@ -4803,7 +4956,134 @@ version = "0.1.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7ddbf48fd451246b1f8c2610bd3b4ac0cc6e149d89832867093ab69a17194f08" dependencies = [ + "bitflags 2.11.0", "libc", + "plain", + "redox_syscall 0.7.4", +] + +[[package]] +name = "lindera" +version = "0.44.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50aba4ef41052280722f2120f65606b9218e8718032a3c752b953c4d8091f02e" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "csv", + "kanaria", + "lindera-cc-cedict", + "lindera-dictionary", + "lindera-ipadic", + "lindera-ipadic-neologd", + "lindera-ko-dic", + "lindera-unidic", + "once_cell", + "regex", + "serde", + "serde_json", + "serde_yaml", + "strum 0.27.2", + "strum_macros 0.27.2", + "unicode-blocks", + "unicode-normalization", + "unicode-segmentation", + "yada", +] + +[[package]] +name = "lindera-cc-cedict" +version = "0.44.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d77e7a0830fd60f23828ad914439997288c1d2cdd9e269be67f967c27b56350" +dependencies = [ + "bincode", + "byteorder", + "lindera-dictionary", + "once_cell", + "tokio", +] + +[[package]] +name = "lindera-dictionary" +version = "0.44.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "489cc70922782af3fd397c0e130846caefe1c15b27c2211aac8f88a9f4590aaf" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "csv", + "derive_builder", + "encoding", + "encoding_rs", + "encoding_rs_io", + "flate2", + "glob", + "log", + "md5", + "memmap2 0.9.10", + "once_cell", + "rand 0.9.2", + "reqwest", + "serde", + "tar", + "thiserror 2.0.18", + "tokio", + "yada", +] + +[[package]] +name = "lindera-ipadic" +version = "0.44.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78870521431dfaf0f94ddd3484fa08367e9d354fc8c708572f2f00007225ddfa" +dependencies = [ + "bincode", + "byteorder", + "lindera-dictionary", + "once_cell", + "tokio", +] + +[[package]] +name = "lindera-ipadic-neologd" +version = "0.44.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "abcb3dc3056e5c683e12c2c5e8d40076f7ecfd7bd46f5fc0e4ae9e58152b5d85" +dependencies = [ + "bincode", + "byteorder", + "lindera-dictionary", + "once_cell", + "tokio", +] + +[[package]] +name = "lindera-ko-dic" +version = "0.44.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e99316158bab14f0256d912055521ca784f76c63e7460db8a74775c5dc1f8bc2" +dependencies = [ + "bincode", + "byteorder", + "lindera-dictionary", + "once_cell", + "tokio", +] + +[[package]] +name = "lindera-unidic" +version = "0.44.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52672945166c14276bbba25e4ec79d7e126db1b503c0a6aa07ffc0141ae15cfa" +dependencies = [ + "bincode", + "byteorder", + "lindera-dictionary", + "once_cell", + "tokio", ] [[package]] @@ -4960,6 +5240,12 @@ dependencies = [ "digest", ] +[[package]] +name = "md5" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae960838283323069879657ca3de837e9f7bbb4c7bf6ea7f1b290d5e9476d2e0" + [[package]] name = "memchr" version = "2.8.0" @@ -5119,7 +5405,7 @@ version = "3.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6944d0bf100571cd6e1a98a316cdca262deb6fccf8d93f5ae1502ca3fc88bd3" dependencies = [ - "bitflags", + "bitflags 2.11.0", "ctor", "futures", "napi-build", @@ -5407,7 +5693,7 @@ version = "6.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "336b9c63443aceef14bea841b899035ae3abe89b7c486aaf4c5bd8aafedac3f0" dependencies = [ - "bitflags", + "bitflags 2.11.0", "libc", "once_cell", "onig_sys", @@ -5525,7 +5811,7 @@ checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" dependencies = [ "cfg-if", "libc", - "redox_syscall", + "redox_syscall 0.5.18", "smallvec", "windows-link", ] @@ -5634,16 +5920,36 @@ dependencies = [ "phf_shared 0.12.1", ] +[[package]] +name = "phf" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf" +dependencies = [ + "phf_shared 0.13.1", + "serde", +] + [[package]] name = "phf_codegen" version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" dependencies = [ - "phf_generator", + "phf_generator 0.11.3", "phf_shared 0.11.3", ] +[[package]] +name = "phf_codegen" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49aa7f9d80421bca176ca8dbfebe668cc7a2684708594ec9f3c0db0805d5d6e1" +dependencies = [ + "phf_generator 0.13.1", + "phf_shared 0.13.1", +] + [[package]] name = "phf_generator" version = "0.11.3" @@ -5654,6 +5960,16 @@ dependencies = [ "rand 0.8.5", ] +[[package]] +name = "phf_generator" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737" +dependencies = [ + "fastrand", + "phf_shared 0.13.1", +] + [[package]] name = "phf_shared" version = "0.11.3" @@ -5672,6 +5988,15 @@ dependencies = [ "siphasher", ] +[[package]] +name = "phf_shared" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e57fef6bc5981e38c2ce2d63bfa546861309f875b8a75f092d1d54ae2d64f266" +dependencies = [ + "siphasher", +] + [[package]] name = "pin-project" version = "1.1.11" @@ -5758,6 +6083,12 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" +[[package]] +name = "plain" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6" + [[package]] name = "planus" version = "0.3.1" @@ -5855,7 +6186,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "465f70d3e96b6d0b1a43c358ba451286b8c8bd56696feff020d65702aa33e35c" dependencies = [ "ahash", - "bitflags", + "bitflags 2.11.0", "bytemuck", "chrono", "chrono-tz 0.8.6", @@ -5929,7 +6260,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "89b2632b1af668e2058d5f8f916d8fbde3cac63d03ae29a705f598e41dcfeb7f" dependencies = [ "ahash", - "bitflags", + "bitflags 2.11.0", "glob", "once_cell", "polars-arrow", @@ -6582,7 +6913,7 @@ version = "11.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "498cd0dc59d73224351ee52a95fee0f1a617a2eae0e7d9d720cc622c73a54186" dependencies = [ - "bitflags", + "bitflags 2.11.0", ] [[package]] @@ -6654,7 +6985,16 @@ version = "0.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" dependencies = [ - "bitflags", + "bitflags 2.11.0", +] + +[[package]] +name = "redox_syscall" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f450ad9c3b1da563fb6948a8e0fb0fb9269711c9c73d9ea1de5058c79c8d643a" +dependencies = [ + "bitflags 2.11.0", ] [[package]] @@ -6950,7 +7290,7 @@ version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" dependencies = [ - "bitflags", + "bitflags 2.11.0", "errno", "libc", "linux-raw-sys", @@ -7175,7 +7515,7 @@ version = "3.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d" dependencies = [ - "bitflags", + "bitflags 2.11.0", "core-foundation 0.10.1", "core-foundation-sys", "libc", @@ -7321,6 +7661,19 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "serde_yaml" +version = "0.9.34+deprecated" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" +dependencies = [ + "indexmap 2.13.0", + "itoa", + "ryu", + "serde", + "unsafe-libyaml", +] + [[package]] name = "sha1" version = "0.10.6" @@ -7661,6 +8014,15 @@ dependencies = [ "strum_macros 0.26.4", ] +[[package]] +name = "strum" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf" +dependencies = [ + "strum_macros 0.27.2", +] + [[package]] name = "strum_macros" version = "0.25.3" @@ -7687,6 +8049,18 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "strum_macros" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7695ce3845ea4b33927c055a39dc438a45b059f7c1b3d91d38d10355fb8cbca7" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "subtle" version = "2.6.1" @@ -7741,7 +8115,7 @@ version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "01198a2debb237c62b6826ec7081082d951f46dbb64b0e8c7649a452230d1dfc" dependencies = [ - "bitflags", + "bitflags 2.11.0", "byteorder", "enum-as-inner", "libc", @@ -7769,7 +8143,7 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a13f3d0daba03132c0aa9767f98351b3488edc2c100cda2d2ec2b04f3d8d3c8b" dependencies = [ - "bitflags", + "bitflags 2.11.0", "core-foundation 0.9.4", "system-configuration-sys", ] @@ -7796,6 +8170,17 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" +[[package]] +name = "tar" +version = "0.4.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22692a6476a21fa75fdfc11d452fda482af402c008cdbaf3476414e122040973" +dependencies = [ + "filetime", + "libc", + "xattr", +] + [[package]] name = "target-features" version = "0.1.6" @@ -8122,7 +8507,7 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e9cd434a998747dd2c4276bc96ee2e0c7a2eadf3cae88e52be55a05fa9053f5" dependencies = [ - "bitflags", + "bitflags 2.11.0", "bytes", "http 1.4.0", "http-body 1.0.1", @@ -8140,7 +8525,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8" dependencies = [ "async-compression", - "bitflags", + "bitflags 2.11.0", "bytes", "futures-core", "futures-util", @@ -8263,6 +8648,12 @@ version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142" +[[package]] +name = "unicode-blocks" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b12e05d9e06373163a9bb6bb8c263c261b396643a99445fe6b9811fd376581b" + [[package]] name = "unicode-ident" version = "1.0.24" @@ -8326,6 +8717,12 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" +[[package]] +name = "unsafe-libyaml" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861" + [[package]] name = "untrusted" version = "0.7.1" @@ -8338,6 +8735,12 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" +[[package]] +name = "unty" +version = "0.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d49784317cd0d1ee7ec5c716dd598ec5b4483ea832a2dced265471cc0f690ae" + [[package]] name = "ureq" version = "2.12.1" @@ -8417,6 +8820,12 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "virtue" +version = "0.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "051eb1abcf10076295e815102942cc58f9d5e3b4560e46e53c21e8ff6f3af7b1" + [[package]] name = "vsimd" version = "0.8.0" @@ -8566,7 +8975,7 @@ version = "0.244.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" dependencies = [ - "bitflags", + "bitflags 2.11.0", "hashbrown 0.15.5", "indexmap 2.13.0", "semver", @@ -8962,7 +9371,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" dependencies = [ "anyhow", - "bitflags", + "bitflags 2.11.0", "indexmap 2.13.0", "log", "serde", @@ -9007,6 +9416,16 @@ dependencies = [ "tap", ] +[[package]] +name = "xattr" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32e45ad4206f6d2479085147f02bc2ef834ac85886624a23575ae137c8aa8156" +dependencies = [ + "libc", + "rustix", +] + [[package]] name = "xmlparser" version = "0.13.6" @@ -9019,6 +9438,12 @@ version = "0.8.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" +[[package]] +name = "yada" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aed111bd9e48a802518765906cbdadf0b45afb72b9c81ab049a3b86252adffdd" + [[package]] name = "yoke" version = "0.8.1" diff --git a/docs/src/python/python.md b/docs/src/python/python.md index 5f4236c83..6dca95ed8 100644 --- a/docs/src/python/python.md +++ b/docs/src/python/python.md @@ -94,11 +94,11 @@ of raw SQL strings with [where][lancedb.query.LanceQueryBuilder.where] and ## Full text search -::: lancedb.fts.create_index +Use [lancedb.table.Table.create_fts_index][] for the synchronous API or +[lancedb.table.AsyncTable.create_index][] with [lancedb.index.FTS][] for the +asynchronous API. -::: lancedb.fts.populate_index - -::: lancedb.fts.search_index +::: lancedb.index.FTS ## Utilities diff --git a/python/python/lancedb/index.py b/python/python/lancedb/index.py index d5e4bfcdc..73daf57b4 100644 --- a/python/python/lancedb/index.py +++ b/python/python/lancedb/index.py @@ -7,6 +7,7 @@ from typing import Literal, Optional from ._lancedb import ( IndexConfig, ) +from .types import BaseTokenizerType lang_mapping = { "ar": "Arabic", @@ -111,8 +112,12 @@ class FTS: - "simple": Splits text by whitespace and punctuation. - "whitespace": Split text by whitespace, but not punctuation. - "raw": No tokenization. The entire text is treated as a single token. + - "ngram": N-gram tokenizer for substring-style matching. + - "jieba/*": Jieba tokenizer loaded from ``LANCE_LANGUAGE_MODEL_HOME``. + - "lindera/*": Lindera tokenizer loaded from ``LANCE_LANGUAGE_MODEL_HOME``. language : str, default "English" - The language to use for tokenization. + The language to use for stemming and stop-word removal. This is not the + primary way to enable CJK tokenization. max_token_length : int, default 40 The maximum token length to index. Tokens longer than this length will be ignored. @@ -127,10 +132,15 @@ class FTS: ascii_folding : bool, default True Whether to fold ASCII characters. This converts accented characters to their ASCII equivalent. For example, "café" would be converted to "cafe". + + Notes + ----- + Model-backed tokenizers such as ``jieba/default`` and ``lindera/ipadic`` + require tokenizer models under ``LANCE_LANGUAGE_MODEL_HOME``. """ with_position: bool = False - base_tokenizer: Literal["simple", "raw", "whitespace"] = "simple" + base_tokenizer: BaseTokenizerType = "simple" language: str = "English" max_token_length: Optional[int] = 40 lower_case: bool = True diff --git a/python/python/lancedb/remote/table.py b/python/python/lancedb/remote/table.py index 905e1481a..a33166937 100644 --- a/python/python/lancedb/remote/table.py +++ b/python/python/lancedb/remote/table.py @@ -39,6 +39,7 @@ from lancedb.table import _normalize_progress from ..query import LanceVectorQueryBuilder, LanceQueryBuilder, LanceTakeQueryBuilder from ..table import AsyncTable, IndexStatistics, Query, Table, Tags +from ..types import BaseTokenizerType class RemoteTable(Table): @@ -167,7 +168,7 @@ class RemoteTable(Table): wait_timeout: Optional[timedelta] = None, with_position: bool = False, # tokenizer configs: - base_tokenizer: str = "simple", + base_tokenizer: BaseTokenizerType = "simple", language: str = "English", max_token_length: Optional[int] = 40, lower_case: bool = True, diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py index d1321b69a..7600011e7 100644 --- a/python/python/lancedb/table.py +++ b/python/python/lancedb/table.py @@ -86,6 +86,52 @@ from .util import ( ) from .index import lang_mapping +_MODEL_BACKED_TOKENIZER_PREFIXES = ("jieba", "lindera") +_MODEL_BACKED_TOKENIZER_ERRORS = ( + "unknown base tokenizer", + "Invalid directory path:", + "Failed to load Jieba", + "Failed to load tokenizer config", + "Failed to initialize default tokenizer", +) + + +def _add_unique_note(exception: BaseException, note: str) -> None: + existing_notes = getattr(exception, "__notes__", ()) or () + if note not in existing_notes: + add_note(exception, note) + + +def _is_model_backed_tokenizer(base_tokenizer: str) -> bool: + return any( + base_tokenizer == prefix or base_tokenizer.startswith(f"{prefix}/") + for prefix in _MODEL_BACKED_TOKENIZER_PREFIXES + ) + + +def _maybe_add_fts_error_note( + exception: BaseException, *, base_tokenizer: str, language: Optional[str] = None +) -> None: + message = str(exception) + if language is not None and "not support the requested language" in message: + supported_langs = ", ".join(lang_mapping.values()) + _add_unique_note(exception, f"Supported languages: {supported_langs}") + return + + if not _is_model_backed_tokenizer(base_tokenizer): + return + + if not any(marker in message for marker in _MODEL_BACKED_TOKENIZER_ERRORS): + return + + _add_unique_note( + exception, + "Model-backed tokenizers such as 'jieba/default' and 'lindera/ipadic' " + "require tokenizer models under LANCE_LANGUAGE_MODEL_HOME. Expected " + "layouts include '$LANCE_LANGUAGE_MODEL_HOME/jieba/default/...' and " + "'$LANCE_LANGUAGE_MODEL_HOME/lindera/ipadic/...'.", + ) + if TYPE_CHECKING: from .db import LanceDBConnection @@ -958,7 +1004,8 @@ class Table(ABC): tokenizer_name: str, default "default" A compatibility alias for native tokenizer configs. Can be "raw", "default" or the 2 letter language code followed by "_stem". So - for english it would be "en_stem". + for english it would be "en_stem". Prefer ``base_tokenizer`` for + new code. use_tantivy: bool, default False Deprecated legacy Tantivy parameter. Setting this to True raises an error. @@ -972,8 +1019,11 @@ class Table(ABC): - "whitespace": Split text by whitespace, but not punctuation. - "raw": No tokenization. The entire text is treated as a single token. - "ngram": N-Gram tokenizer. + - "jieba/*": Jieba tokenizer loaded from ``LANCE_LANGUAGE_MODEL_HOME``. + - "lindera/*": Lindera tokenizer loaded from ``LANCE_LANGUAGE_MODEL_HOME``. language : str, default "English" - The language to use for tokenization. + The language to use for stemming and stop-word removal. This is not + the primary way to enable CJK tokenization. max_token_length : int, default 40 The maximum token length to index. Tokens longer than this length will be ignored. @@ -999,6 +1049,11 @@ class Table(ABC): The timeout to wait if indexing is asynchronous. name: str, optional The name of the index. If not provided, a default name will be generated. + + Notes + ----- + Model-backed tokenizers such as ``jieba/default`` and ``lindera/ipadic`` + require tokenizer models under ``LANCE_LANGUAGE_MODEL_HOME``. """ raise NotImplementedError @@ -2462,14 +2517,22 @@ class LanceTable(Table): **tokenizer_configs, ) - LOOP.run( - self._table.create_index( - field_names, - replace=replace, - config=config, - name=name, + try: + LOOP.run( + self._table.create_index( + field_names, + replace=replace, + config=config, + name=name, + ) ) - ) + except (ValueError, RuntimeError) as e: + _maybe_add_fts_error_note( + e, + base_tokenizer=config.base_tokenizer, + language=config.language, + ) + raise e @staticmethod def infer_tokenizer_configs(tokenizer_name: str) -> dict: @@ -3865,11 +3928,13 @@ class AsyncTable: name=name, train=train, ) - except ValueError as e: - if "not support the requested language" in str(e): - supported_langs = ", ".join(lang_mapping.values()) - help_msg = f"Supported languages: {supported_langs}" - add_note(e, help_msg) + except (ValueError, RuntimeError) as e: + if isinstance(config, FTS): + _maybe_add_fts_error_note( + e, + base_tokenizer=config.base_tokenizer, + language=config.language, + ) raise e async def drop_index(self, name: str) -> None: diff --git a/python/python/lancedb/types.py b/python/python/lancedb/types.py index e7b185f2b..2e26e5630 100644 --- a/python/python/lancedb/types.py +++ b/python/python/lancedb/types.py @@ -40,4 +40,5 @@ IndexType = Literal[ ] # Tokenizer literals -BaseTokenizerType = Literal["simple", "raw", "whitespace", "ngram"] +BuiltinTokenizerType = Literal["simple", "raw", "whitespace", "ngram"] +BaseTokenizerType = BuiltinTokenizerType | str diff --git a/python/python/tests/models/jieba/default/dict.txt b/python/python/tests/models/jieba/default/dict.txt new file mode 100644 index 000000000..237b47ca6 --- /dev/null +++ b/python/python/tests/models/jieba/default/dict.txt @@ -0,0 +1,8 @@ +我们 98740 r +都 202780 d +有 423765 v +光明 1219 n +的 318825 uj +前途 1263 n +前 62779 f +途 857 n diff --git a/python/python/tests/models/lindera/ipadic/config.yml b/python/python/tests/models/lindera/ipadic/config.yml new file mode 100644 index 000000000..6b3f0af40 --- /dev/null +++ b/python/python/tests/models/lindera/ipadic/config.yml @@ -0,0 +1,4 @@ +segmenter: + mode: "normal" + dictionary: + path: "./python/tests/models/lindera/ipadic/main" \ No newline at end of file diff --git a/python/python/tests/models/lindera/ipadic/main.zip b/python/python/tests/models/lindera/ipadic/main.zip new file mode 100644 index 0000000000000000000000000000000000000000..4db1bab93907f925de36e4361792d3b540b1211e GIT binary patch literal 2450 zcmWIWW@h1H00G%e&!fN$D8b4g!;qVpnWrBb!pXpV@tTSKn@tg>`Kl^EQ(J_ zP18%t%)@6L&`XTej4n3&}kI!_O){hOb>eP!tDYo)hWPOIDK|8D2= znEOXO`&-(te$}$9-C23}uU6RezjxiM4>>LUVDqMTMQ6eufwpAdr&WOg!M(rh45enS z;kV^jl-b+4>-wj=8o{h*1E01WTyZ}r^y-bS?5vnTuemSuq_qQbE8}!$J@DK1N8!U= z{!O)00y(d*=l;4-{(6D?dWPVNw(S-DmVXY})V2RM>%N_HHaj|nb&_~w+lEP#t4#ME z**lkQ{e@#SH*_3?=a)XdY|*?~$zSogMt|fM#pkZq6vZQ$#T3K~J7rF!#vBZN#5mFQ z#DWv^51#WC4RGP)d*`^Y*K$ol=>0#XSlWh*V z*BlbBxw@18WxrsEVXx$#1KukPCo1lcICkG%>Hmj!yubV8@4wuuVzlAR+Jvb2*Kb&# zIS`e;zL2Z-#qn4TpNE$F7WV#+aQG6tA?11E+s`?(@{d2-lelL>VazuR`}7sF&HBFU zN*~jfJnsAV^1JF=7TLC!t9&1`-+jO0_3_WelE>dh#GTu~XRUbN^MwDaXp^4lLC2Q; z%=~VfdH>PxCn{&G&sm@Io?L$8{P()~%$MJn*3@15|8UFs%1c#dXlYA)rqn(QU=jym z9$4Bc&C4bxS?K`vfPn8rYrn$|5-rJRntJz2oH4eM5prVE5D{ZB5ai`_G4U`dv1BM< z)6!DXbKsOM`QjlWB2vKg&e2);*uAS2^)Dy8-#<%g}Wtc2UIu?ip*d$PPkTZrm-@Bg*!n|NAkJu>wGVDCe;m0 zrxqlwS=*!)!E749{<ixG-?|^Fz$B_)_JY@6^K+Tz=iO6O+o#;!mh-1m=Txo~Te{7o=QB1v zZ`t%)_>`T{sVes=f5S9in`TzrT;gb!sgZY~V1xVNpQi7>$p)5*2bTped!x84XSH8s zNK&r-decu)=Ig&^XTFt3Ek!zCsx^Vi7I7dJf`vY~6e-UyN-54vslb=DK_$t9G)PIp z>(jqRp?BRUsSWQq_Q`OvADXv8RfeOR-y%)(hf47O#x25?|LP{k zZ8|<@2)sS9m)9+T!R_Jaz2^4?ycT=#EHG$eoZ_|A$#wCPMIxQNYaZ$cYD|46(e=CJ zd4R;VkerUc0$+8XT`N#q)Rpu9A$z1;my+?DjT+*HH{RdAADg7bdt14R|Ck%gx5pOe zp6?CtW@M6M##O^fKpep!!0^@)#Dvy&tdRN+t&V~ig{!uM8O6Y`q|qK`6jI#>G!sj` z2Xh|wS`FFE)i5(r>p0?!MJ>>fjb&$s`Wjxm;WH88dDKjbY+@@5)I>yb24`8q=3-_} zWOJXw%tg+sKtr*%N+6LA3suzIhHPjiD~4l3L-4s2SL#PLRF4hWP*7gLWg;t3B?GYW N#KABJ=+*~d9sp?rIfMWJ literal 0 HcmV?d00001 diff --git a/python/python/tests/test_fts.py b/python/python/tests/test_fts.py index 57f2db85d..ad57e9d07 100644 --- a/python/python/tests/test_fts.py +++ b/python/python/tests/test_fts.py @@ -15,7 +15,10 @@ # limitations under the License. import os import random +import shutil from unittest import mock +from pathlib import Path +import zipfile import lancedb as ldb from lancedb.db import DBConnection @@ -36,6 +39,8 @@ import pytest import pytest_asyncio from utils import exception_output +TEST_LANGUAGE_MODEL_HOME = Path(__file__).parent / "models" + @pytest.fixture def table(tmp_path) -> ldb.table.LanceTable: @@ -89,6 +94,30 @@ def table(tmp_path) -> ldb.table.LanceTable: return table +@pytest.fixture +def language_model_home(monkeypatch): + monkeypatch.setenv("LANCE_LANGUAGE_MODEL_HOME", str(TEST_LANGUAGE_MODEL_HOME)) + return TEST_LANGUAGE_MODEL_HOME + + +@pytest.fixture +def lindera_ipadic(language_model_home): + model_path = language_model_home / "lindera" / "ipadic" + extracted_model = model_path / "main" + + if extracted_model.exists(): + shutil.rmtree(extracted_model) + + with zipfile.ZipFile(model_path / "main.zip", "r") as zip_ref: + zip_ref.extractall(model_path) + + try: + yield + finally: + if extracted_model.exists(): + shutil.rmtree(extracted_model) + + @pytest_asyncio.fixture async def async_table(tmp_path) -> ldb.table.AsyncTable: # Use local random state to avoid affecting other tests @@ -684,6 +713,90 @@ def test_fts_ngram(mem_db: DBConnection): assert set(r["text"] for r in results) == {"lance database", "lance is cool"} +def test_fts_jieba_tokenizer(mem_db: DBConnection, language_model_home): + data = pa.table({"text": ["我们都有光明的前途", "光明的前途"]}) + table = mem_db.create_table("test_jieba", data=data) + table.create_fts_index( + "text", + base_tokenizer="jieba/default", + stem=False, + remove_stop_words=False, + ascii_folding=False, + ) + + results = table.search("我们", query_type="fts").limit(10).to_list() + assert [row["text"] for row in results] == ["我们都有光明的前途"] + + +def test_fts_jieba_missing_language_model_note( + mem_db: DBConnection, monkeypatch, tmp_path +): + missing_root = tmp_path / "missing-language-models" + monkeypatch.setenv("LANCE_LANGUAGE_MODEL_HOME", str(missing_root)) + table = mem_db.create_table( + "test_missing_jieba_model", + data=pa.table({"text": ["我们都有光明的前途"]}), + ) + + with pytest.raises((ValueError, RuntimeError)) as e: + table.create_fts_index( + "text", + base_tokenizer="jieba/default", + stem=False, + remove_stop_words=False, + ascii_folding=False, + ) + + output = exception_output(e) + assert "Invalid directory path:" in output + assert "LANCE_LANGUAGE_MODEL_HOME" in output + assert "jieba/default" in output + + +@pytest.mark.asyncio +async def test_fts_jieba_missing_language_model_note_async(monkeypatch, tmp_path): + missing_root = tmp_path / "missing-language-models" + monkeypatch.setenv("LANCE_LANGUAGE_MODEL_HOME", str(missing_root)) + db = await ldb.connect_async(tmp_path / "async-db") + table = await db.create_table( + "test_missing_jieba_model_async", + data=pa.table({"text": ["我们都有光明的前途"]}), + ) + + with pytest.raises((ValueError, RuntimeError)) as e: + await table.create_index( + "text", + config=FTS( + base_tokenizer="jieba/default", + stem=False, + remove_stop_words=False, + ascii_folding=False, + ), + ) + + output = exception_output(e) + assert "Invalid directory path:" in output + assert "LANCE_LANGUAGE_MODEL_HOME" in output + assert "jieba/default" in output + + +def test_fts_lindera_tokenizer( + mem_db: DBConnection, language_model_home, lindera_ipadic +): + data = pa.table({"text": ["成田国際空港", "東京国際空港", "羽田空港"]}) + table = mem_db.create_table("test_lindera", data=data) + table.create_fts_index( + "text", + base_tokenizer="lindera/ipadic", + stem=False, + remove_stop_words=False, + ascii_folding=False, + ) + + results = table.search("成田", query_type="fts").limit(10).to_list() + assert [row["text"] for row in results] == ["成田国際空港"] + + def test_fts_query_to_json(): """Test that FTS query to_json() produces valid JSON strings with exact format.""" diff --git a/rust/lancedb/Cargo.toml b/rust/lancedb/Cargo.toml index 9bb30ba07..4913a420c 100644 --- a/rust/lancedb/Cargo.toml +++ b/rust/lancedb/Cargo.toml @@ -40,7 +40,7 @@ lance-datafusion.workspace = true lance-datagen = { workspace = true } lance-file = { workspace = true } lance-io = { workspace = true } -lance-index = { workspace = true } +lance-index = { workspace = true, features = ["tokenizer-jieba", "tokenizer-lindera"] } lance-table = { workspace = true } lance-linalg = { workspace = true } lance-testing = { workspace = true }