From eb264d9adfd4f8a2d0ec37758c3c8660bc3414b2 Mon Sep 17 00:00:00 2001 From: yihong Date: Mon, 25 May 2026 16:51:40 +0800 Subject: [PATCH] fix: faster jieba (#8158) * fix: faster jieba Signed-off-by: yihong0618 * fix: also update tantivy and the api Signed-off-by: yihong0618 * fix: better bench follow the copilot review Signed-off-by: yihong0618 * fix: apply comments Signed-off-by: yihong0618 --------- Signed-off-by: yihong0618 --- Cargo.lock | 213 +++++++++++++++------- src/index/Cargo.toml | 6 +- src/index/benches/tokenizer_bench.rs | 176 +++++++++++++++++- src/index/src/fulltext_index.rs | 2 +- src/index/src/fulltext_index/tokenizer.rs | 27 ++- 5 files changed, 342 insertions(+), 82 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 45b726ab82..a0494b4266 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1321,9 +1321,9 @@ dependencies = [ [[package]] name = "bitpacking" -version = "0.9.2" +version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c1d3e2bfd8d06048a179f7b17afc3188effa10385e7b00dc65af6aae732ea92" +checksum = "96a7139abd3d9cebf8cd6f920a389cf3dc9576172e32f4563f188cae3c3eb019" dependencies = [ "crunchy", ] @@ -1832,7 +1832,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c7d8d1efd5109b9c1cd3b7966bd071cdfb53bb6eb0b22a473a68c2f70a11a1eb" dependencies = [ "parse-zoneinfo", - "phf_codegen", + "phf_codegen 0.12.1", "phf_shared 0.12.1", "uncased", ] @@ -4380,6 +4380,12 @@ dependencies = [ "tracing", ] +[[package]] +name = "datasketches" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c286de4e81ea2590afc24d754e0f83810c566f50a1388fa75ebd57928c0d9745" + [[package]] name = "datatypes" version = "1.1.0" @@ -5486,12 +5492,12 @@ dependencies = [ [[package]] name = "fs4" -version = "0.8.4" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7e180ac76c23b45e767bd7ae9579bc0bb458618c4bc71835926e098e61d15f8" +checksum = "8640e34b88f7652208ce9e88b1a37a2ae95227d84abec377ccd3c5cfeb141ed4" dependencies = [ - "rustix 0.38.44", - "windows-sys 0.52.0", + "rustix 1.0.7", + "windows-sys 0.59.0", ] [[package]] @@ -6564,27 +6570,37 @@ checksum = "cb56e1aa765b4b4f3aadfab769793b7087bb03a4ea4920644a6d238e2df5b9ed" [[package]] name = "include-flate" -version = "0.3.0" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df49c16750695486c1f34de05da5b7438096156466e7f76c38fcdf285cf0113e" +checksum = "23e233413926ef735f7d87024466cfda5a4b87467730846bd82ea7d504121347" dependencies = [ "include-flate-codegen", - "lazy_static", - "libflate", + "include-flate-compress", ] [[package]] name = "include-flate-codegen" -version = "0.2.0" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c5b246c6261be723b85c61ecf87804e8ea4a35cb68be0ff282ed84b95ffe7d7" +checksum = "5e7148f24ef8922cc0e5574ebb908729ccdd3a110c440a45165733fedadd9969" dependencies = [ - "libflate", + "include-flate-compress", + "proc-macro-error2", "proc-macro2", "quote", "syn 2.0.117", ] +[[package]] +name = "include-flate-compress" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74783a9ed407e844e99d5e7a57bd650acbfa124cf6e97ffd790ba59d8ab8e7ff" +dependencies = [ + "libflate", + "zstd", +] + [[package]] name = "include_dir" version = "0.7.4" @@ -6918,25 +6934,25 @@ checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" [[package]] name = "jieba-macros" -version = "0.8.0" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6105f38f083bb1a79ad523bd32fa0d8ffcb6abd2fc4da9da203c32bca5b6ace3" +checksum = "661344b2412fb00aee1841d2405c9a31f7c91cf6e578a8e953647c43dd1a8b0a" dependencies = [ - "phf_codegen", + "phf_codegen 0.13.1", ] [[package]] name = "jieba-rs" -version = "0.8.0" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47982a320106da83b0c5d6aec0fb83e109f0132b69670b063adaa6fa5b4f3f4a" +checksum = "d7ef90d6209fcff084a01b488c4199d882e3764b15ff0e7a6b5d7efaa46e1e4f" dependencies = [ "cedarwood", - "fxhash", "include-flate", "jieba-macros", - "phf 0.12.1", + "phf 0.13.1", "regex", + "rustc-hash 2.1.1", ] [[package]] @@ -7483,25 +7499,25 @@ checksum = "bcc35a38544a891a5f7c865aca548a982ccb3b8650a5b06d0fd33a10283c56fc" [[package]] name = "libflate" -version = "2.1.0" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45d9dfdc14ea4ef0900c1cddbc8dcd553fbaacd8a4a282cf4018ae9dd04fb21e" +checksum = "cd96e993e5f3368b0cb8497dae6c860c22af8ff18388c61c6c0b86c58d86b5df" dependencies = [ "adler32", - "core2", "crc32fast", "dary_heap", "libflate_lz77", + "no_std_io2", ] [[package]] name = "libflate_lz77" -version = "2.1.0" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6e0d73b369f386f1c44abd9c570d5318f55ccde816ff4b562fa452e5182863d" +checksum = "ff7a10e427698aef6eef269482776debfef63384d30f13aad39a1a95e0e098fd" dependencies = [ - "core2", - "hashbrown 0.14.5", + "hashbrown 0.16.1", + "no_std_io2", "rle-decode-fast", ] @@ -7816,6 +7832,15 @@ dependencies = [ "hashbrown 0.15.4", ] +[[package]] +name = "lru" +version = "0.16.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f66e8d5d03f609abc3a39e6f08e4164ebf1447a732906d39eb9b99b7919ef39" +dependencies = [ + "hashbrown 0.16.1", +] + [[package]] name = "lru-slab" version = "0.1.2" @@ -8434,7 +8459,7 @@ dependencies = [ "flate2", "io-enum", "libc", - "lru", + "lru 0.12.5", "mysql_common 0.34.1", "named_pipe", "pem", @@ -8497,7 +8522,7 @@ dependencies = [ "futures-sink", "futures-util", "keyed_priority_queue", - "lru", + "lru 0.12.5", "mysql_common 0.34.1", "pem", "percent-encoding", @@ -8695,6 +8720,15 @@ dependencies = [ "libc", ] +[[package]] +name = "no_std_io2" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "418abd1b6d34fbf6cae440dc874771b0525a604428704c76e48b29a5e67b8003" +dependencies = [ + "memchr", +] + [[package]] name = "nohash" version = "0.2.0" @@ -9635,6 +9669,15 @@ dependencies = [ "serde", ] +[[package]] +name = "ordered-float" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7d950ca161dc355eaf28f82b11345ed76c6e1f6eb1f4f4479e0323b9e2fbd0e" +dependencies = [ + "num-traits", +] + [[package]] name = "ordered-multimap" version = "0.4.3" @@ -10122,6 +10165,15 @@ checksum = "913273894cec178f401a31ec4b656318d95473527be05c0752cc41cdc32be8b7" dependencies = [ "phf_macros", "phf_shared 0.12.1", +] + +[[package]] +name = "phf" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf" +dependencies = [ + "phf_shared 0.13.1", "serde", ] @@ -10131,10 +10183,20 @@ version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "efbdcb6f01d193b17f0b9c3360fa7e0e620991b193ff08702f78b3ce365d7e61" dependencies = [ - "phf_generator", + "phf_generator 0.12.1", "phf_shared 0.12.1", ] +[[package]] +name = "phf_codegen" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49aa7f9d80421bca176ca8dbfebe668cc7a2684708594ec9f3c0db0805d5d6e1" +dependencies = [ + "phf_generator 0.13.1", + "phf_shared 0.13.1", +] + [[package]] name = "phf_generator" version = "0.12.1" @@ -10145,13 +10207,23 @@ dependencies = [ "phf_shared 0.12.1", ] +[[package]] +name = "phf_generator" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737" +dependencies = [ + "fastrand", + "phf_shared 0.13.1", +] + [[package]] name = "phf_macros" version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d713258393a82f091ead52047ca779d37e5766226d009de21696c4e667044368" dependencies = [ - "phf_generator", + "phf_generator 0.12.1", "phf_shared 0.12.1", "proc-macro2", "quote", @@ -10178,6 +10250,15 @@ dependencies = [ "uncased", ] +[[package]] +name = "phf_shared" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e57fef6bc5981e38c2ce2d63bfa546861309f875b8a75f092d1d54ae2d64f266" +dependencies = [ + "siphasher", +] + [[package]] name = "pin-project" version = "1.1.10" @@ -11415,16 +11496,6 @@ version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c8d0fd677905edcbeedbf2edb6494d676f0e98d54d5cf9bda0b061cb8fb8aba" -[[package]] -name = "rand_distr" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31" -dependencies = [ - "num-traits", - "rand 0.8.5", -] - [[package]] name = "rand_xorshift" version = "0.4.0" @@ -12961,9 +13032,9 @@ checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" [[package]] name = "sketches-ddsketch" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1e9a774a6c28142ac54bb25d25562e6bcf957493a184f15ad4eebccb23e410a" +checksum = "05e40b6cf54d988dc1a2223531b969c9a9e30906ad90ef64890c27b4bfbb46ea" dependencies = [ "serde", ] @@ -13864,9 +13935,9 @@ checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" [[package]] name = "tantivy" -version = "0.24.2" +version = "0.26.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64a966cb0e76e311f09cf18507c9af192f15d34886ee43d7ba7c7e3803660c43" +checksum = "edde6a10743fff00a4e1a8c9ef020bf5f3cbad301b7d2d39f2b07f123c4eac07" dependencies = [ "aho-corasick", "arc-swap", @@ -13877,17 +13948,17 @@ dependencies = [ "census", "crc32fast", "crossbeam-channel", + "datasketches", "downcast-rs", "fastdivide", "fnv", "fs4", "htmlescape", - "hyperloglogplus", "itertools 0.14.0", "levenshtein_automata", "log", - "lru", - "lz4_flex 0.11.6", + "lru 0.16.4", + "lz4_flex 0.13.1", "measure_time", "memmap2", "once_cell", @@ -13910,6 +13981,7 @@ dependencies = [ "tempfile", "thiserror 2.0.17", "time", + "typetag", "uuid", "winapi", "zstd", @@ -13917,18 +13989,18 @@ dependencies = [ [[package]] name = "tantivy-bitpacker" -version = "0.8.0" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1adc286a39e089ae9938935cd488d7d34f14502544a36607effd2239ff0e2494" +checksum = "4fed3d674429bcd2de5d0a6d1aa5495fed8afd9c5ecce993019caf7615f53fa4" dependencies = [ "bitpacking", ] [[package]] name = "tantivy-columnar" -version = "0.5.0" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6300428e0c104c4f7db6f95b466a6f5c1b9aece094ec57cdd365337908dc7344" +checksum = "c57166f5bcfd478f370ab8445afb4678dce44801fa5ce5c451aaf8595583c5dc" dependencies = [ "downcast-rs", "fastdivide", @@ -13942,9 +14014,9 @@ dependencies = [ [[package]] name = "tantivy-common" -version = "0.9.0" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e91b6ea6090ce03dc72c27d0619e77185d26cc3b20775966c346c6d4f7e99d7f" +checksum = "bbf10915aa75da3c3b0d58b58853d2e889efbaf32d4982a4c3715dde6bba23e5" dependencies = [ "async-trait", "byteorder", @@ -13966,9 +14038,9 @@ dependencies = [ [[package]] name = "tantivy-jieba" -version = "0.16.0" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b08147cc130e323ecc522117927b198bec617fe1df562a0b6449905858d0363" +checksum = "3392170e86f1c387170aba7d171a466ffdc98a8b55b006e19ac64b123a7b690a" dependencies = [ "jieba-rs", "lazy_static", @@ -13977,20 +14049,22 @@ dependencies = [ [[package]] name = "tantivy-query-grammar" -version = "0.24.0" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e810cdeeebca57fc3f7bfec5f85fdbea9031b2ac9b990eb5ff49b371d52bbe6a" +checksum = "dfadb8526b6da90704feb293b0701a6aae62ea14983143344be2dc5ce30f1d82" dependencies = [ + "fnv", "nom 7.1.3", + "ordered-float 5.3.0", "serde", "serde_json", ] [[package]] name = "tantivy-sstable" -version = "0.5.0" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "709f22c08a4c90e1b36711c1c6cad5ae21b20b093e535b69b18783dd2cb99416" +checksum = "8a2cfc3ac5164cbadc28965ffb145a8f47582a60ae5897859ad8d4316596c606" dependencies = [ "futures-util", "itertools 0.14.0", @@ -14002,20 +14076,19 @@ dependencies = [ [[package]] name = "tantivy-stacker" -version = "0.5.0" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2bcdebb267671311d1e8891fd9d1301803fdb8ad21ba22e0a30d0cab49ba59c1" +checksum = "6cbb051742da9d53ca9e8fff43a9b10e319338b24e2c0e15d0372df19ffeb951" dependencies = [ "murmurhash32", - "rand_distr", "tantivy-common", ] [[package]] name = "tantivy-tokenizer-api" -version = "0.5.0" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfa942fcee81e213e09715bbce8734ae2180070b97b33839a795ba1de201547d" +checksum = "eac258c2c6390673f2685813afeeafcb8c4e0ee7de8dd3fc46838dcc37263f98" dependencies = [ "serde", ] @@ -15018,9 +15091,9 @@ checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" [[package]] name = "typetag" -version = "0.2.20" +version = "0.2.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73f22b40dd7bfe8c14230cf9702081366421890435b2d625fa92b4acc4c3de6f" +checksum = "c5a897b12c6c1151ad0b138b8db50252dc301f93bc3b027db05eec82aeed298c" dependencies = [ "erased-serde", "inventory", @@ -15031,9 +15104,9 @@ dependencies = [ [[package]] name = "typetag-impl" -version = "0.2.20" +version = "0.2.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35f5380909ffc31b4de4f4bdf96b877175a016aa2ca98cee39fcfd8c4d53d952" +checksum = "cf808357c6ed7e13ba0f3277ec8d8f21b2d501274895104263985330c726c1c5" dependencies = [ "proc-macro2", "quote", diff --git a/src/index/Cargo.toml b/src/index/Cargo.toml index 3b78f7d22f..167c1c0df1 100644 --- a/src/index/Cargo.toml +++ b/src/index/Cargo.toml @@ -26,7 +26,7 @@ fst.workspace = true futures.workspace = true greptime-proto.workspace = true itertools.workspace = true -jieba-rs = "0.8" +jieba-rs = "0.10" lazy_static.workspace = true mockall.workspace = true nalgebra.workspace = true @@ -40,8 +40,8 @@ serde.workspace = true serde_json.workspace = true snafu.workspace = true store-api.workspace = true -tantivy = { version = "0.24", features = ["zstd-compression"] } -tantivy-jieba = "0.16" +tantivy = { version = "0.26", features = ["zstd-compression"] } +tantivy-jieba = "0.20" tokio.workspace = true tokio-util.workspace = true usearch = { version = "2.21", default-features = false, features = ["fp16lib"], optional = true } diff --git a/src/index/benches/tokenizer_bench.rs b/src/index/benches/tokenizer_bench.rs index e365c884b2..f376fe57d7 100644 --- a/src/index/benches/tokenizer_bench.rs +++ b/src/index/benches/tokenizer_bench.rs @@ -12,8 +12,79 @@ // See the License for the specific language governing permissions and // limitations under the License. -use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; -use index::fulltext_index::tokenizer::{EnglishTokenizer, Tokenizer}; +use std::collections::HashMap; +use std::hint::black_box; +use std::path::PathBuf; +use std::time::Duration; + +use async_trait::async_trait; +use criterion::{BatchSize, BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +use futures::AsyncRead; +use index::fulltext_index::create::{FulltextIndexCreator, TantivyFulltextIndexCreator}; +use index::fulltext_index::tokenizer::{ChineseTokenizer, EnglishTokenizer, Tokenizer}; +use index::fulltext_index::{Analyzer, Config}; +use puffin::puffin_manager::{PuffinWriter, PutOptions}; + +const CHINESE_TOKENIZER_TEXTS: &[(&str, &str)] = &[ + ("short", "登录手机号。中国农业银行。"), + ( + "mixed_log", + "2025-08-01 21:09:28 用户登录失败 trace_id=abc_123 dynamic_key=mobile_login 中国农业银行接口返回超时。", + ), + ( + "product_search", + "哈基米哦南北绿豆,噢马自立曼波。装电视台,中国中央广播电视台。压不缩,笑不活。", + ), + ( + "long_news", + "中国农业银行发布公告称,手机银行登录服务完成升级。多个地区用户反馈查询速度提升,后台监控显示核心链路延迟下降,异常请求自动重试次数减少。系统继续保留 trace_id、request_id 和 dynamic_key 等字段用于排查问题。", + ), +]; + +const CHINESE_INDEX_DOCS: &[&str] = &[ + "登录手机号,中国农业银行手机银行接口返回成功。", + "用户登录失败,trace_id=abc_123,dynamic_key=mobile_login。", + "中国中央广播电视台发布新的节目预告。", + "装电视台的时候遇到压不缩的问题。", + "哈基米哦南北绿豆,噢马自立曼波。", + "后台监控显示核心链路延迟下降。", + "系统保留 request_id 用于排查问题。", + "中文全文索引需要兼顾召回率和 token 数量。", +]; + +struct NoopPuffinWriter; + +#[async_trait] +impl PuffinWriter for NoopPuffinWriter { + async fn put_blob( + &mut self, + _key: &str, + _raw_data: R, + _options: PutOptions, + _properties: HashMap, + ) -> puffin::error::Result + where + R: AsyncRead + Send, + { + unreachable!("tantivy fulltext benchmark only writes directory blobs") + } + + async fn put_dir( + &mut self, + _key: &str, + _dir: PathBuf, + _options: PutOptions, + _properties: HashMap, + ) -> puffin::error::Result { + Ok(0) + } + + fn set_footer_lz4_compressed(&mut self, _lz4_compressed: bool) {} + + async fn finish(self) -> puffin::error::Result { + Ok(0) + } +} fn bench_english_tokenizer(c: &mut Criterion) { let tokenizer = EnglishTokenizer; @@ -86,5 +157,104 @@ fn bench_english_tokenizer(c: &mut Criterion) { repeat_group.finish(); } -criterion_group!(benches, bench_english_tokenizer); +fn bench_chinese_tokenizer(c: &mut Criterion) { + let tokenizer = ChineseTokenizer; + let mut group = c.benchmark_group("chinese_tokenizer"); + + for (name, text) in CHINESE_TOKENIZER_TEXTS { + group.throughput(Throughput::Bytes(text.len() as u64)); + group.bench_with_input(BenchmarkId::new("tokenize", name), text, |b, text| { + b.iter(|| black_box(tokenizer.tokenize(black_box(text)))) + }); + } + + group.finish(); + + let mut repeat_group = c.benchmark_group("chinese_tokenizer_repeated"); + let sample_text = CHINESE_TOKENIZER_TEXTS + .iter() + .find(|(name, _)| *name == "mixed_log") + .map(|(_, text)| *text) + .expect("mixed_log sample must exist"); + + for repeat_count in [10, 100, 1000] { + repeat_group.bench_with_input( + BenchmarkId::new("repeated_tokenize", repeat_count), + &repeat_count, + |b, &repeat_count| { + b.iter(|| { + for _ in 0..repeat_count { + black_box(tokenizer.tokenize(black_box(sample_text))); + } + }) + }, + ); + } + + repeat_group.finish(); +} + +fn bench_tantivy_chinese_fulltext_index(c: &mut Criterion) { + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .expect("failed to create Tokio runtime"); + let config = Config { + analyzer: Analyzer::Chinese, + case_sensitive: false, + }; + let mut group = c.benchmark_group("tantivy_chinese_fulltext_index"); + group.sample_size(10); + group.measurement_time(Duration::from_secs(10)); + + for doc_count in [32usize, 256usize] { + group.throughput(Throughput::Elements(doc_count as u64)); + group.bench_with_input( + BenchmarkId::new("build_commit", doc_count), + &doc_count, + |b, &doc_count| { + b.iter_batched( + tempfile::tempdir, + |dir| { + let dir = dir.expect("failed to create temp dir"); + runtime.block_on(async { + let mut creator = + TantivyFulltextIndexCreator::new(dir.path(), config, 64 << 20) + .await + .expect("failed to create tantivy fulltext index"); + for idx in 0..doc_count { + let text = CHINESE_INDEX_DOCS[idx % CHINESE_INDEX_DOCS.len()]; + creator + .push_text(black_box(text)) + .await + .expect("failed to push text"); + } + let mut puffin_writer = NoopPuffinWriter; + creator + .finish( + &mut puffin_writer, + "tantivy_chinese_fulltext_index", + PutOptions::default(), + ) + .await + .expect("failed to commit tantivy fulltext index"); + }); + // Return the temp dir so Criterion drops it after timing the routine. + dir + }, + BatchSize::SmallInput, + ) + }, + ); + } + + group.finish(); +} + +criterion_group!( + benches, + bench_english_tokenizer, + bench_chinese_tokenizer, + bench_tantivy_chinese_fulltext_index +); criterion_main!(benches); diff --git a/src/index/src/fulltext_index.rs b/src/index/src/fulltext_index.rs index 8de28c0490..06a36f65a8 100644 --- a/src/index/src/fulltext_index.rs +++ b/src/index/src/fulltext_index.rs @@ -52,7 +52,7 @@ impl Config { fn build_tantivy_tokenizer(&self) -> TokenizerManager { let mut builder = match self.analyzer { Analyzer::English => TextAnalyzer::builder(SimpleTokenizer::default()).dynamic(), - Analyzer::Chinese => TextAnalyzer::builder(JiebaTokenizer {}).dynamic(), + Analyzer::Chinese => TextAnalyzer::builder(JiebaTokenizer::new()).dynamic(), }; if !self.case_sensitive { diff --git a/src/index/src/fulltext_index/tokenizer.rs b/src/index/src/fulltext_index/tokenizer.rs index 919c497317..3afc826e6f 100644 --- a/src/index/src/fulltext_index/tokenizer.rs +++ b/src/index/src/fulltext_index/tokenizer.rs @@ -98,7 +98,8 @@ impl Tokenizer for ChineseTokenizer { let mut tokens = JIEBA .cut_for_search(text, true) .into_iter() - .filter(|s| is_indexable_token(s)) + .map(|token| token.word) + .filter(|token| is_indexable_token(token)) .collect::>(); let english = EnglishTokenizer {}; @@ -336,10 +337,26 @@ mod tests { let text = "哈基米哦南北绿豆,噢马自立曼波。登录手机号。中国农业银行。装电视台,中国中央广播电视台。压不缩,笑不活。"; let default_tokens = tokenizer.tokenize(text); - let cut_hmm_false = JIEBA.cut(text, false); - let cut_hmm_true = JIEBA.cut(text, true); - let cut_for_search_hmm_false = JIEBA.cut_for_search(text, false); - let cut_for_search_hmm_true = JIEBA.cut_for_search(text, true); + let cut_hmm_false = JIEBA + .cut(text, false) + .into_iter() + .map(|token| token.word) + .collect::>(); + let cut_hmm_true = JIEBA + .cut(text, true) + .into_iter() + .map(|token| token.word) + .collect::>(); + let cut_for_search_hmm_false = JIEBA + .cut_for_search(text, false) + .into_iter() + .map(|token| token.word) + .collect::>(); + let cut_for_search_hmm_true = JIEBA + .cut_for_search(text, true) + .into_iter() + .map(|token| token.word) + .collect::>(); assert_eq!( default_tokens,