mirror of
https://github.com/lancedb/lancedb.git
synced 2026-05-14 18:40:39 +00:00
feat(python): support model-backed native FTS tokenizers
This commit is contained in:
469
Cargo.lock
generated
469
Cargo.lock
generated
@@ -365,7 +365,7 @@ version = "57.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8c872d36b7bf2a6a6a2b40de9156265f0242910791db366a2c17476ba8330d68"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"bitflags 2.11.0",
|
||||
"serde_core",
|
||||
"serde_json",
|
||||
]
|
||||
@@ -1186,6 +1186,26 @@ dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bincode"
|
||||
version = "2.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "36eaf5d7b090263e8150820482d5d93cd964a81e4019913c972f4edcc6edb740"
|
||||
dependencies = [
|
||||
"bincode_derive",
|
||||
"serde",
|
||||
"unty",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bincode_derive"
|
||||
version = "2.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bf95709a440f45e986983918d0e8a1f30a9b1df04918fc828670606804ac3c09"
|
||||
dependencies = [
|
||||
"virtue",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bit-set"
|
||||
version = "0.8.0"
|
||||
@@ -1201,6 +1221,12 @@ version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7"
|
||||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
version = "1.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
||||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
version = "2.11.0"
|
||||
@@ -1438,6 +1464,15 @@ dependencies = [
|
||||
"shlex",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cedarwood"
|
||||
version = "0.4.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6d910bedd62c24733263d0bed247460853c9d22e8956bd4cd964302095e04e90"
|
||||
dependencies = [
|
||||
"smallvec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cfg-if"
|
||||
version = "1.0.4"
|
||||
@@ -1493,7 +1528,7 @@ checksum = "433e39f13c9a060046954e0592a8d0a4bcb1040125cbf91cb8ee58964cfb350f"
|
||||
dependencies = [
|
||||
"parse-zoneinfo",
|
||||
"phf 0.11.3",
|
||||
"phf_codegen",
|
||||
"phf_codegen 0.11.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1751,7 +1786,7 @@ version = "0.29.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d8b9f2e4c67f833b660cdb0a3523065869fb35570177239812ed4c905aeff87b"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"bitflags 2.11.0",
|
||||
"crossterm_winapi",
|
||||
"document-features",
|
||||
"parking_lot",
|
||||
@@ -2761,6 +2796,70 @@ version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0"
|
||||
|
||||
[[package]]
|
||||
name = "encoding"
|
||||
version = "0.2.33"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec"
|
||||
dependencies = [
|
||||
"encoding-index-japanese",
|
||||
"encoding-index-korean",
|
||||
"encoding-index-simpchinese",
|
||||
"encoding-index-singlebyte",
|
||||
"encoding-index-tradchinese",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "encoding-index-japanese"
|
||||
version = "1.20141219.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91"
|
||||
dependencies = [
|
||||
"encoding_index_tests",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "encoding-index-korean"
|
||||
version = "1.20141219.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81"
|
||||
dependencies = [
|
||||
"encoding_index_tests",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "encoding-index-simpchinese"
|
||||
version = "1.20141219.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7"
|
||||
dependencies = [
|
||||
"encoding_index_tests",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "encoding-index-singlebyte"
|
||||
version = "1.20141219.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a"
|
||||
dependencies = [
|
||||
"encoding_index_tests",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "encoding-index-tradchinese"
|
||||
version = "1.20141219.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18"
|
||||
dependencies = [
|
||||
"encoding_index_tests",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "encoding_index_tests"
|
||||
version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569"
|
||||
|
||||
[[package]]
|
||||
name = "encoding_rs"
|
||||
version = "0.8.35"
|
||||
@@ -2770,6 +2869,15 @@ dependencies = [
|
||||
"cfg-if",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "encoding_rs_io"
|
||||
version = "0.1.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1cc3c5651fb62ab8aa3103998dade57efdd028544bd300516baa31840c252a83"
|
||||
dependencies = [
|
||||
"encoding_rs",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "enum-as-inner"
|
||||
version = "0.6.1"
|
||||
@@ -2925,6 +3033,17 @@ dependencies = [
|
||||
"subtle",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "filetime"
|
||||
version = "0.2.27"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f98844151eee8917efc50bd9e8318cb963ae8b297431495d3f758616ea5c57db"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"libredox",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "find-msvc-tools"
|
||||
version = "0.1.9"
|
||||
@@ -2943,7 +3062,7 @@ version = "25.12.19"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "35f6839d7b3b98adde531effaf34f0c2badc6f4735d26fe74709d8e513a96ef3"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"bitflags 2.11.0",
|
||||
"rustc_version",
|
||||
]
|
||||
|
||||
@@ -3893,7 +4012,7 @@ version = "0.7.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fdd7bddefd0a8833b88a4b68f90dae22c7450d11b354198baee3874fd811b344"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"bitflags 2.11.0",
|
||||
"cfg-if",
|
||||
"libc",
|
||||
]
|
||||
@@ -3968,6 +4087,28 @@ version = "1.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9028f49264629065d057f340a86acb84867925865f73bbf8d47b4d149a7e88b8"
|
||||
|
||||
[[package]]
|
||||
name = "jieba-macros"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a29cfc5dcd898604c6f80363411fa6b6b08e27d1d253d6225b9cb6702ea02fc0"
|
||||
dependencies = [
|
||||
"phf_codegen 0.13.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "jieba-rs"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3245d6e9d1d5facbd6a23848d6b67e3439738ccbb4fa5a3d65da315ba1a910a2"
|
||||
dependencies = [
|
||||
"cedarwood",
|
||||
"jieba-macros",
|
||||
"phf 0.13.1",
|
||||
"regex",
|
||||
"rustc-hash",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "jiff"
|
||||
version = "0.2.23"
|
||||
@@ -4064,6 +4205,15 @@ dependencies = [
|
||||
"simple_asn1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kanaria"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c0f9d9652540055ac4fded998a73aca97d965899077ab1212587437da44196ff"
|
||||
dependencies = [
|
||||
"bitflags 1.3.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lance"
|
||||
version = "6.0.0-beta.1"
|
||||
@@ -4285,7 +4435,7 @@ dependencies = [
|
||||
"prost-types",
|
||||
"rand 0.9.2",
|
||||
"snafu 0.9.0",
|
||||
"strum",
|
||||
"strum 0.26.3",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"xxhash-rust",
|
||||
@@ -4355,6 +4505,7 @@ dependencies = [
|
||||
"futures",
|
||||
"half",
|
||||
"itertools 0.13.0",
|
||||
"jieba-rs",
|
||||
"jsonb",
|
||||
"lance-arrow",
|
||||
"lance-core",
|
||||
@@ -4569,6 +4720,8 @@ name = "lance-tokenizer"
|
||||
version = "6.0.0-beta.1"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v6.0.0-beta.1#c7a7d3a0e944646e793d297d4a2e2cf7e4fb28a3"
|
||||
dependencies = [
|
||||
"jieba-rs",
|
||||
"lindera",
|
||||
"rust-stemmers",
|
||||
"serde",
|
||||
"unicode-normalization",
|
||||
@@ -4803,7 +4956,134 @@ version = "0.1.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7ddbf48fd451246b1f8c2610bd3b4ac0cc6e149d89832867093ab69a17194f08"
|
||||
dependencies = [
|
||||
"bitflags 2.11.0",
|
||||
"libc",
|
||||
"plain",
|
||||
"redox_syscall 0.7.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lindera"
|
||||
version = "0.44.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "50aba4ef41052280722f2120f65606b9218e8718032a3c752b953c4d8091f02e"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bincode",
|
||||
"byteorder",
|
||||
"csv",
|
||||
"kanaria",
|
||||
"lindera-cc-cedict",
|
||||
"lindera-dictionary",
|
||||
"lindera-ipadic",
|
||||
"lindera-ipadic-neologd",
|
||||
"lindera-ko-dic",
|
||||
"lindera-unidic",
|
||||
"once_cell",
|
||||
"regex",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_yaml",
|
||||
"strum 0.27.2",
|
||||
"strum_macros 0.27.2",
|
||||
"unicode-blocks",
|
||||
"unicode-normalization",
|
||||
"unicode-segmentation",
|
||||
"yada",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lindera-cc-cedict"
|
||||
version = "0.44.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5d77e7a0830fd60f23828ad914439997288c1d2cdd9e269be67f967c27b56350"
|
||||
dependencies = [
|
||||
"bincode",
|
||||
"byteorder",
|
||||
"lindera-dictionary",
|
||||
"once_cell",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lindera-dictionary"
|
||||
version = "0.44.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "489cc70922782af3fd397c0e130846caefe1c15b27c2211aac8f88a9f4590aaf"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bincode",
|
||||
"byteorder",
|
||||
"csv",
|
||||
"derive_builder",
|
||||
"encoding",
|
||||
"encoding_rs",
|
||||
"encoding_rs_io",
|
||||
"flate2",
|
||||
"glob",
|
||||
"log",
|
||||
"md5",
|
||||
"memmap2 0.9.10",
|
||||
"once_cell",
|
||||
"rand 0.9.2",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"tar",
|
||||
"thiserror 2.0.18",
|
||||
"tokio",
|
||||
"yada",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lindera-ipadic"
|
||||
version = "0.44.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "78870521431dfaf0f94ddd3484fa08367e9d354fc8c708572f2f00007225ddfa"
|
||||
dependencies = [
|
||||
"bincode",
|
||||
"byteorder",
|
||||
"lindera-dictionary",
|
||||
"once_cell",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lindera-ipadic-neologd"
|
||||
version = "0.44.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "abcb3dc3056e5c683e12c2c5e8d40076f7ecfd7bd46f5fc0e4ae9e58152b5d85"
|
||||
dependencies = [
|
||||
"bincode",
|
||||
"byteorder",
|
||||
"lindera-dictionary",
|
||||
"once_cell",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lindera-ko-dic"
|
||||
version = "0.44.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e99316158bab14f0256d912055521ca784f76c63e7460db8a74775c5dc1f8bc2"
|
||||
dependencies = [
|
||||
"bincode",
|
||||
"byteorder",
|
||||
"lindera-dictionary",
|
||||
"once_cell",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lindera-unidic"
|
||||
version = "0.44.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "52672945166c14276bbba25e4ec79d7e126db1b503c0a6aa07ffc0141ae15cfa"
|
||||
dependencies = [
|
||||
"bincode",
|
||||
"byteorder",
|
||||
"lindera-dictionary",
|
||||
"once_cell",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4960,6 +5240,12 @@ dependencies = [
|
||||
"digest",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "md5"
|
||||
version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ae960838283323069879657ca3de837e9f7bbb4c7bf6ea7f1b290d5e9476d2e0"
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.8.0"
|
||||
@@ -5119,7 +5405,7 @@ version = "3.8.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e6944d0bf100571cd6e1a98a316cdca262deb6fccf8d93f5ae1502ca3fc88bd3"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"bitflags 2.11.0",
|
||||
"ctor",
|
||||
"futures",
|
||||
"napi-build",
|
||||
@@ -5407,7 +5693,7 @@ version = "6.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "336b9c63443aceef14bea841b899035ae3abe89b7c486aaf4c5bd8aafedac3f0"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"bitflags 2.11.0",
|
||||
"libc",
|
||||
"once_cell",
|
||||
"onig_sys",
|
||||
@@ -5525,7 +5811,7 @@ checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"redox_syscall",
|
||||
"redox_syscall 0.5.18",
|
||||
"smallvec",
|
||||
"windows-link",
|
||||
]
|
||||
@@ -5634,16 +5920,36 @@ dependencies = [
|
||||
"phf_shared 0.12.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf"
|
||||
version = "0.13.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf"
|
||||
dependencies = [
|
||||
"phf_shared 0.13.1",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_codegen"
|
||||
version = "0.11.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a"
|
||||
dependencies = [
|
||||
"phf_generator",
|
||||
"phf_generator 0.11.3",
|
||||
"phf_shared 0.11.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_codegen"
|
||||
version = "0.13.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "49aa7f9d80421bca176ca8dbfebe668cc7a2684708594ec9f3c0db0805d5d6e1"
|
||||
dependencies = [
|
||||
"phf_generator 0.13.1",
|
||||
"phf_shared 0.13.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_generator"
|
||||
version = "0.11.3"
|
||||
@@ -5654,6 +5960,16 @@ dependencies = [
|
||||
"rand 0.8.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_generator"
|
||||
version = "0.13.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737"
|
||||
dependencies = [
|
||||
"fastrand",
|
||||
"phf_shared 0.13.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_shared"
|
||||
version = "0.11.3"
|
||||
@@ -5672,6 +5988,15 @@ dependencies = [
|
||||
"siphasher",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_shared"
|
||||
version = "0.13.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e57fef6bc5981e38c2ce2d63bfa546861309f875b8a75f092d1d54ae2d64f266"
|
||||
dependencies = [
|
||||
"siphasher",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pin-project"
|
||||
version = "1.1.11"
|
||||
@@ -5758,6 +6083,12 @@ version = "0.3.32"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
|
||||
|
||||
[[package]]
|
||||
name = "plain"
|
||||
version = "0.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6"
|
||||
|
||||
[[package]]
|
||||
name = "planus"
|
||||
version = "0.3.1"
|
||||
@@ -5855,7 +6186,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "465f70d3e96b6d0b1a43c358ba451286b8c8bd56696feff020d65702aa33e35c"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"bitflags",
|
||||
"bitflags 2.11.0",
|
||||
"bytemuck",
|
||||
"chrono",
|
||||
"chrono-tz 0.8.6",
|
||||
@@ -5929,7 +6260,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "89b2632b1af668e2058d5f8f916d8fbde3cac63d03ae29a705f598e41dcfeb7f"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"bitflags",
|
||||
"bitflags 2.11.0",
|
||||
"glob",
|
||||
"once_cell",
|
||||
"polars-arrow",
|
||||
@@ -6582,7 +6913,7 @@ version = "11.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "498cd0dc59d73224351ee52a95fee0f1a617a2eae0e7d9d720cc622c73a54186"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"bitflags 2.11.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6654,7 +6985,16 @@ version = "0.5.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"bitflags 2.11.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "redox_syscall"
|
||||
version = "0.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f450ad9c3b1da563fb6948a8e0fb0fb9269711c9c73d9ea1de5058c79c8d643a"
|
||||
dependencies = [
|
||||
"bitflags 2.11.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6950,7 +7290,7 @@ version = "1.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"bitflags 2.11.0",
|
||||
"errno",
|
||||
"libc",
|
||||
"linux-raw-sys",
|
||||
@@ -7175,7 +7515,7 @@ version = "3.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"bitflags 2.11.0",
|
||||
"core-foundation 0.10.1",
|
||||
"core-foundation-sys",
|
||||
"libc",
|
||||
@@ -7321,6 +7661,19 @@ dependencies = [
|
||||
"syn 2.0.117",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_yaml"
|
||||
version = "0.9.34+deprecated"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47"
|
||||
dependencies = [
|
||||
"indexmap 2.13.0",
|
||||
"itoa",
|
||||
"ryu",
|
||||
"serde",
|
||||
"unsafe-libyaml",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sha1"
|
||||
version = "0.10.6"
|
||||
@@ -7661,6 +8014,15 @@ dependencies = [
|
||||
"strum_macros 0.26.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "strum"
|
||||
version = "0.27.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf"
|
||||
dependencies = [
|
||||
"strum_macros 0.27.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "strum_macros"
|
||||
version = "0.25.3"
|
||||
@@ -7687,6 +8049,18 @@ dependencies = [
|
||||
"syn 2.0.117",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "strum_macros"
|
||||
version = "0.27.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7695ce3845ea4b33927c055a39dc438a45b059f7c1b3d91d38d10355fb8cbca7"
|
||||
dependencies = [
|
||||
"heck 0.5.0",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.117",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "subtle"
|
||||
version = "2.6.1"
|
||||
@@ -7741,7 +8115,7 @@ version = "0.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "01198a2debb237c62b6826ec7081082d951f46dbb64b0e8c7649a452230d1dfc"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"bitflags 2.11.0",
|
||||
"byteorder",
|
||||
"enum-as-inner",
|
||||
"libc",
|
||||
@@ -7769,7 +8143,7 @@ version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a13f3d0daba03132c0aa9767f98351b3488edc2c100cda2d2ec2b04f3d8d3c8b"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"bitflags 2.11.0",
|
||||
"core-foundation 0.9.4",
|
||||
"system-configuration-sys",
|
||||
]
|
||||
@@ -7796,6 +8170,17 @@ version = "1.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
|
||||
|
||||
[[package]]
|
||||
name = "tar"
|
||||
version = "0.4.45"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "22692a6476a21fa75fdfc11d452fda482af402c008cdbaf3476414e122040973"
|
||||
dependencies = [
|
||||
"filetime",
|
||||
"libc",
|
||||
"xattr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "target-features"
|
||||
version = "0.1.6"
|
||||
@@ -8122,7 +8507,7 @@ version = "0.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1e9cd434a998747dd2c4276bc96ee2e0c7a2eadf3cae88e52be55a05fa9053f5"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"bitflags 2.11.0",
|
||||
"bytes",
|
||||
"http 1.4.0",
|
||||
"http-body 1.0.1",
|
||||
@@ -8140,7 +8525,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8"
|
||||
dependencies = [
|
||||
"async-compression",
|
||||
"bitflags",
|
||||
"bitflags 2.11.0",
|
||||
"bytes",
|
||||
"futures-core",
|
||||
"futures-util",
|
||||
@@ -8263,6 +8648,12 @@ version = "2.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-blocks"
|
||||
version = "0.1.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6b12e05d9e06373163a9bb6bb8c263c261b396643a99445fe6b9811fd376581b"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-ident"
|
||||
version = "1.0.24"
|
||||
@@ -8326,6 +8717,12 @@ version = "0.2.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3"
|
||||
|
||||
[[package]]
|
||||
name = "unsafe-libyaml"
|
||||
version = "0.2.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861"
|
||||
|
||||
[[package]]
|
||||
name = "untrusted"
|
||||
version = "0.7.1"
|
||||
@@ -8338,6 +8735,12 @@ version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
|
||||
|
||||
[[package]]
|
||||
name = "unty"
|
||||
version = "0.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6d49784317cd0d1ee7ec5c716dd598ec5b4483ea832a2dced265471cc0f690ae"
|
||||
|
||||
[[package]]
|
||||
name = "ureq"
|
||||
version = "2.12.1"
|
||||
@@ -8417,6 +8820,12 @@ version = "0.9.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
|
||||
|
||||
[[package]]
|
||||
name = "virtue"
|
||||
version = "0.0.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "051eb1abcf10076295e815102942cc58f9d5e3b4560e46e53c21e8ff6f3af7b1"
|
||||
|
||||
[[package]]
|
||||
name = "vsimd"
|
||||
version = "0.8.0"
|
||||
@@ -8566,7 +8975,7 @@ version = "0.244.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"bitflags 2.11.0",
|
||||
"hashbrown 0.15.5",
|
||||
"indexmap 2.13.0",
|
||||
"semver",
|
||||
@@ -8962,7 +9371,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bitflags",
|
||||
"bitflags 2.11.0",
|
||||
"indexmap 2.13.0",
|
||||
"log",
|
||||
"serde",
|
||||
@@ -9007,6 +9416,16 @@ dependencies = [
|
||||
"tap",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "xattr"
|
||||
version = "1.6.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "32e45ad4206f6d2479085147f02bc2ef834ac85886624a23575ae137c8aa8156"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"rustix",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "xmlparser"
|
||||
version = "0.13.6"
|
||||
@@ -9019,6 +9438,12 @@ version = "0.8.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3"
|
||||
|
||||
[[package]]
|
||||
name = "yada"
|
||||
version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "aed111bd9e48a802518765906cbdadf0b45afb72b9c81ab049a3b86252adffdd"
|
||||
|
||||
[[package]]
|
||||
name = "yoke"
|
||||
version = "0.8.1"
|
||||
|
||||
@@ -94,11 +94,11 @@ of raw SQL strings with [where][lancedb.query.LanceQueryBuilder.where] and
|
||||
|
||||
## Full text search
|
||||
|
||||
::: lancedb.fts.create_index
|
||||
Use [lancedb.table.Table.create_fts_index][] for the synchronous API or
|
||||
[lancedb.table.AsyncTable.create_index][] with [lancedb.index.FTS][] for the
|
||||
asynchronous API.
|
||||
|
||||
::: lancedb.fts.populate_index
|
||||
|
||||
::: lancedb.fts.search_index
|
||||
::: lancedb.index.FTS
|
||||
|
||||
## Utilities
|
||||
|
||||
|
||||
@@ -7,6 +7,7 @@ from typing import Literal, Optional
|
||||
from ._lancedb import (
|
||||
IndexConfig,
|
||||
)
|
||||
from .types import BaseTokenizerType
|
||||
|
||||
lang_mapping = {
|
||||
"ar": "Arabic",
|
||||
@@ -111,8 +112,12 @@ class FTS:
|
||||
- "simple": Splits text by whitespace and punctuation.
|
||||
- "whitespace": Split text by whitespace, but not punctuation.
|
||||
- "raw": No tokenization. The entire text is treated as a single token.
|
||||
- "ngram": N-gram tokenizer for substring-style matching.
|
||||
- "jieba/*": Jieba tokenizer loaded from ``LANCE_LANGUAGE_MODEL_HOME``.
|
||||
- "lindera/*": Lindera tokenizer loaded from ``LANCE_LANGUAGE_MODEL_HOME``.
|
||||
language : str, default "English"
|
||||
The language to use for tokenization.
|
||||
The language to use for stemming and stop-word removal. This is not the
|
||||
primary way to enable CJK tokenization.
|
||||
max_token_length : int, default 40
|
||||
The maximum token length to index. Tokens longer than this length will be
|
||||
ignored.
|
||||
@@ -127,10 +132,15 @@ class FTS:
|
||||
ascii_folding : bool, default True
|
||||
Whether to fold ASCII characters. This converts accented characters to
|
||||
their ASCII equivalent. For example, "café" would be converted to "cafe".
|
||||
|
||||
Notes
|
||||
-----
|
||||
Model-backed tokenizers such as ``jieba/default`` and ``lindera/ipadic``
|
||||
require tokenizer models under ``LANCE_LANGUAGE_MODEL_HOME``.
|
||||
"""
|
||||
|
||||
with_position: bool = False
|
||||
base_tokenizer: Literal["simple", "raw", "whitespace"] = "simple"
|
||||
base_tokenizer: BaseTokenizerType = "simple"
|
||||
language: str = "English"
|
||||
max_token_length: Optional[int] = 40
|
||||
lower_case: bool = True
|
||||
|
||||
@@ -39,6 +39,7 @@ from lancedb.table import _normalize_progress
|
||||
|
||||
from ..query import LanceVectorQueryBuilder, LanceQueryBuilder, LanceTakeQueryBuilder
|
||||
from ..table import AsyncTable, IndexStatistics, Query, Table, Tags
|
||||
from ..types import BaseTokenizerType
|
||||
|
||||
|
||||
class RemoteTable(Table):
|
||||
@@ -167,7 +168,7 @@ class RemoteTable(Table):
|
||||
wait_timeout: Optional[timedelta] = None,
|
||||
with_position: bool = False,
|
||||
# tokenizer configs:
|
||||
base_tokenizer: str = "simple",
|
||||
base_tokenizer: BaseTokenizerType = "simple",
|
||||
language: str = "English",
|
||||
max_token_length: Optional[int] = 40,
|
||||
lower_case: bool = True,
|
||||
|
||||
@@ -86,6 +86,52 @@ from .util import (
|
||||
)
|
||||
from .index import lang_mapping
|
||||
|
||||
_MODEL_BACKED_TOKENIZER_PREFIXES = ("jieba", "lindera")
|
||||
_MODEL_BACKED_TOKENIZER_ERRORS = (
|
||||
"unknown base tokenizer",
|
||||
"Invalid directory path:",
|
||||
"Failed to load Jieba",
|
||||
"Failed to load tokenizer config",
|
||||
"Failed to initialize default tokenizer",
|
||||
)
|
||||
|
||||
|
||||
def _add_unique_note(exception: BaseException, note: str) -> None:
|
||||
existing_notes = getattr(exception, "__notes__", ()) or ()
|
||||
if note not in existing_notes:
|
||||
add_note(exception, note)
|
||||
|
||||
|
||||
def _is_model_backed_tokenizer(base_tokenizer: str) -> bool:
|
||||
return any(
|
||||
base_tokenizer == prefix or base_tokenizer.startswith(f"{prefix}/")
|
||||
for prefix in _MODEL_BACKED_TOKENIZER_PREFIXES
|
||||
)
|
||||
|
||||
|
||||
def _maybe_add_fts_error_note(
|
||||
exception: BaseException, *, base_tokenizer: str, language: Optional[str] = None
|
||||
) -> None:
|
||||
message = str(exception)
|
||||
if language is not None and "not support the requested language" in message:
|
||||
supported_langs = ", ".join(lang_mapping.values())
|
||||
_add_unique_note(exception, f"Supported languages: {supported_langs}")
|
||||
return
|
||||
|
||||
if not _is_model_backed_tokenizer(base_tokenizer):
|
||||
return
|
||||
|
||||
if not any(marker in message for marker in _MODEL_BACKED_TOKENIZER_ERRORS):
|
||||
return
|
||||
|
||||
_add_unique_note(
|
||||
exception,
|
||||
"Model-backed tokenizers such as 'jieba/default' and 'lindera/ipadic' "
|
||||
"require tokenizer models under LANCE_LANGUAGE_MODEL_HOME. Expected "
|
||||
"layouts include '$LANCE_LANGUAGE_MODEL_HOME/jieba/default/...' and "
|
||||
"'$LANCE_LANGUAGE_MODEL_HOME/lindera/ipadic/...'.",
|
||||
)
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .db import LanceDBConnection
|
||||
@@ -958,7 +1004,8 @@ class Table(ABC):
|
||||
tokenizer_name: str, default "default"
|
||||
A compatibility alias for native tokenizer configs. Can be "raw",
|
||||
"default" or the 2 letter language code followed by "_stem". So
|
||||
for english it would be "en_stem".
|
||||
for english it would be "en_stem". Prefer ``base_tokenizer`` for
|
||||
new code.
|
||||
use_tantivy: bool, default False
|
||||
Deprecated legacy Tantivy parameter. Setting this to True raises an
|
||||
error.
|
||||
@@ -972,8 +1019,11 @@ class Table(ABC):
|
||||
- "whitespace": Split text by whitespace, but not punctuation.
|
||||
- "raw": No tokenization. The entire text is treated as a single token.
|
||||
- "ngram": N-Gram tokenizer.
|
||||
- "jieba/*": Jieba tokenizer loaded from ``LANCE_LANGUAGE_MODEL_HOME``.
|
||||
- "lindera/*": Lindera tokenizer loaded from ``LANCE_LANGUAGE_MODEL_HOME``.
|
||||
language : str, default "English"
|
||||
The language to use for tokenization.
|
||||
The language to use for stemming and stop-word removal. This is not
|
||||
the primary way to enable CJK tokenization.
|
||||
max_token_length : int, default 40
|
||||
The maximum token length to index. Tokens longer than this length will be
|
||||
ignored.
|
||||
@@ -999,6 +1049,11 @@ class Table(ABC):
|
||||
The timeout to wait if indexing is asynchronous.
|
||||
name: str, optional
|
||||
The name of the index. If not provided, a default name will be generated.
|
||||
|
||||
Notes
|
||||
-----
|
||||
Model-backed tokenizers such as ``jieba/default`` and ``lindera/ipadic``
|
||||
require tokenizer models under ``LANCE_LANGUAGE_MODEL_HOME``.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@@ -2462,14 +2517,22 @@ class LanceTable(Table):
|
||||
**tokenizer_configs,
|
||||
)
|
||||
|
||||
LOOP.run(
|
||||
self._table.create_index(
|
||||
field_names,
|
||||
replace=replace,
|
||||
config=config,
|
||||
name=name,
|
||||
try:
|
||||
LOOP.run(
|
||||
self._table.create_index(
|
||||
field_names,
|
||||
replace=replace,
|
||||
config=config,
|
||||
name=name,
|
||||
)
|
||||
)
|
||||
)
|
||||
except (ValueError, RuntimeError) as e:
|
||||
_maybe_add_fts_error_note(
|
||||
e,
|
||||
base_tokenizer=config.base_tokenizer,
|
||||
language=config.language,
|
||||
)
|
||||
raise e
|
||||
|
||||
@staticmethod
|
||||
def infer_tokenizer_configs(tokenizer_name: str) -> dict:
|
||||
@@ -3865,11 +3928,13 @@ class AsyncTable:
|
||||
name=name,
|
||||
train=train,
|
||||
)
|
||||
except ValueError as e:
|
||||
if "not support the requested language" in str(e):
|
||||
supported_langs = ", ".join(lang_mapping.values())
|
||||
help_msg = f"Supported languages: {supported_langs}"
|
||||
add_note(e, help_msg)
|
||||
except (ValueError, RuntimeError) as e:
|
||||
if isinstance(config, FTS):
|
||||
_maybe_add_fts_error_note(
|
||||
e,
|
||||
base_tokenizer=config.base_tokenizer,
|
||||
language=config.language,
|
||||
)
|
||||
raise e
|
||||
|
||||
async def drop_index(self, name: str) -> None:
|
||||
|
||||
@@ -40,4 +40,5 @@ IndexType = Literal[
|
||||
]
|
||||
|
||||
# Tokenizer literals
|
||||
BaseTokenizerType = Literal["simple", "raw", "whitespace", "ngram"]
|
||||
BuiltinTokenizerType = Literal["simple", "raw", "whitespace", "ngram"]
|
||||
BaseTokenizerType = BuiltinTokenizerType | str
|
||||
|
||||
8
python/python/tests/models/jieba/default/dict.txt
Normal file
8
python/python/tests/models/jieba/default/dict.txt
Normal file
@@ -0,0 +1,8 @@
|
||||
我们 98740 r
|
||||
都 202780 d
|
||||
有 423765 v
|
||||
光明 1219 n
|
||||
的 318825 uj
|
||||
前途 1263 n
|
||||
前 62779 f
|
||||
途 857 n
|
||||
4
python/python/tests/models/lindera/ipadic/config.yml
Normal file
4
python/python/tests/models/lindera/ipadic/config.yml
Normal file
@@ -0,0 +1,4 @@
|
||||
segmenter:
|
||||
mode: "normal"
|
||||
dictionary:
|
||||
path: "./python/tests/models/lindera/ipadic/main"
|
||||
BIN
python/python/tests/models/lindera/ipadic/main.zip
Normal file
BIN
python/python/tests/models/lindera/ipadic/main.zip
Normal file
Binary file not shown.
@@ -15,7 +15,10 @@
|
||||
# limitations under the License.
|
||||
import os
|
||||
import random
|
||||
import shutil
|
||||
from unittest import mock
|
||||
from pathlib import Path
|
||||
import zipfile
|
||||
|
||||
import lancedb as ldb
|
||||
from lancedb.db import DBConnection
|
||||
@@ -36,6 +39,8 @@ import pytest
|
||||
import pytest_asyncio
|
||||
from utils import exception_output
|
||||
|
||||
TEST_LANGUAGE_MODEL_HOME = Path(__file__).parent / "models"
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def table(tmp_path) -> ldb.table.LanceTable:
|
||||
@@ -89,6 +94,30 @@ def table(tmp_path) -> ldb.table.LanceTable:
|
||||
return table
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def language_model_home(monkeypatch):
|
||||
monkeypatch.setenv("LANCE_LANGUAGE_MODEL_HOME", str(TEST_LANGUAGE_MODEL_HOME))
|
||||
return TEST_LANGUAGE_MODEL_HOME
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def lindera_ipadic(language_model_home):
|
||||
model_path = language_model_home / "lindera" / "ipadic"
|
||||
extracted_model = model_path / "main"
|
||||
|
||||
if extracted_model.exists():
|
||||
shutil.rmtree(extracted_model)
|
||||
|
||||
with zipfile.ZipFile(model_path / "main.zip", "r") as zip_ref:
|
||||
zip_ref.extractall(model_path)
|
||||
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
if extracted_model.exists():
|
||||
shutil.rmtree(extracted_model)
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def async_table(tmp_path) -> ldb.table.AsyncTable:
|
||||
# Use local random state to avoid affecting other tests
|
||||
@@ -684,6 +713,90 @@ def test_fts_ngram(mem_db: DBConnection):
|
||||
assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
|
||||
|
||||
|
||||
def test_fts_jieba_tokenizer(mem_db: DBConnection, language_model_home):
|
||||
data = pa.table({"text": ["我们都有光明的前途", "光明的前途"]})
|
||||
table = mem_db.create_table("test_jieba", data=data)
|
||||
table.create_fts_index(
|
||||
"text",
|
||||
base_tokenizer="jieba/default",
|
||||
stem=False,
|
||||
remove_stop_words=False,
|
||||
ascii_folding=False,
|
||||
)
|
||||
|
||||
results = table.search("我们", query_type="fts").limit(10).to_list()
|
||||
assert [row["text"] for row in results] == ["我们都有光明的前途"]
|
||||
|
||||
|
||||
def test_fts_jieba_missing_language_model_note(
|
||||
mem_db: DBConnection, monkeypatch, tmp_path
|
||||
):
|
||||
missing_root = tmp_path / "missing-language-models"
|
||||
monkeypatch.setenv("LANCE_LANGUAGE_MODEL_HOME", str(missing_root))
|
||||
table = mem_db.create_table(
|
||||
"test_missing_jieba_model",
|
||||
data=pa.table({"text": ["我们都有光明的前途"]}),
|
||||
)
|
||||
|
||||
with pytest.raises((ValueError, RuntimeError)) as e:
|
||||
table.create_fts_index(
|
||||
"text",
|
||||
base_tokenizer="jieba/default",
|
||||
stem=False,
|
||||
remove_stop_words=False,
|
||||
ascii_folding=False,
|
||||
)
|
||||
|
||||
output = exception_output(e)
|
||||
assert "Invalid directory path:" in output
|
||||
assert "LANCE_LANGUAGE_MODEL_HOME" in output
|
||||
assert "jieba/default" in output
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_fts_jieba_missing_language_model_note_async(monkeypatch, tmp_path):
|
||||
missing_root = tmp_path / "missing-language-models"
|
||||
monkeypatch.setenv("LANCE_LANGUAGE_MODEL_HOME", str(missing_root))
|
||||
db = await ldb.connect_async(tmp_path / "async-db")
|
||||
table = await db.create_table(
|
||||
"test_missing_jieba_model_async",
|
||||
data=pa.table({"text": ["我们都有光明的前途"]}),
|
||||
)
|
||||
|
||||
with pytest.raises((ValueError, RuntimeError)) as e:
|
||||
await table.create_index(
|
||||
"text",
|
||||
config=FTS(
|
||||
base_tokenizer="jieba/default",
|
||||
stem=False,
|
||||
remove_stop_words=False,
|
||||
ascii_folding=False,
|
||||
),
|
||||
)
|
||||
|
||||
output = exception_output(e)
|
||||
assert "Invalid directory path:" in output
|
||||
assert "LANCE_LANGUAGE_MODEL_HOME" in output
|
||||
assert "jieba/default" in output
|
||||
|
||||
|
||||
def test_fts_lindera_tokenizer(
|
||||
mem_db: DBConnection, language_model_home, lindera_ipadic
|
||||
):
|
||||
data = pa.table({"text": ["成田国際空港", "東京国際空港", "羽田空港"]})
|
||||
table = mem_db.create_table("test_lindera", data=data)
|
||||
table.create_fts_index(
|
||||
"text",
|
||||
base_tokenizer="lindera/ipadic",
|
||||
stem=False,
|
||||
remove_stop_words=False,
|
||||
ascii_folding=False,
|
||||
)
|
||||
|
||||
results = table.search("成田", query_type="fts").limit(10).to_list()
|
||||
assert [row["text"] for row in results] == ["成田国際空港"]
|
||||
|
||||
|
||||
def test_fts_query_to_json():
|
||||
"""Test that FTS query to_json() produces valid JSON strings with exact format."""
|
||||
|
||||
|
||||
@@ -40,7 +40,7 @@ lance-datafusion.workspace = true
|
||||
lance-datagen = { workspace = true }
|
||||
lance-file = { workspace = true }
|
||||
lance-io = { workspace = true }
|
||||
lance-index = { workspace = true }
|
||||
lance-index = { workspace = true, features = ["tokenizer-jieba", "tokenizer-lindera"] }
|
||||
lance-table = { workspace = true }
|
||||
lance-linalg = { workspace = true }
|
||||
lance-testing = { workspace = true }
|
||||
|
||||
Reference in New Issue
Block a user