fix: faster jieba (#8158)

* fix: faster jieba

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* fix: also update tantivy and the api

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* fix: better bench follow the copilot review

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* fix: apply comments

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

---------

Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2026-05-28 10:50:39 +00:00 · 2026-05-25 16:51:40 +08:00
parent a25152664b
commit eb264d9adf
5 changed files with 342 additions and 82 deletions
--- a/src/index/Cargo.toml
+++ b/src/index/Cargo.toml
@@ -26,7 +26,7 @@ fst.workspace = true
 futures.workspace = true
 greptime-proto.workspace = true
 itertools.workspace = true
-jieba-rs = "0.8"
+jieba-rs = "0.10"
 lazy_static.workspace = true
 mockall.workspace = true
 nalgebra.workspace = true
@@ -40,8 +40,8 @@ serde.workspace = true
 serde_json.workspace = true
 snafu.workspace = true
 store-api.workspace = true
-tantivy = { version = "0.24", features = ["zstd-compression"] }
-tantivy-jieba = "0.16"
+tantivy = { version = "0.26", features = ["zstd-compression"] }
+tantivy-jieba = "0.20"
 tokio.workspace = true
 tokio-util.workspace = true
 usearch = { version = "2.21", default-features = false, features = ["fp16lib"], optional = true }
--- a/src/index/benches/tokenizer_bench.rs
+++ b/src/index/benches/tokenizer_bench.rs
@@ -12,8 +12,79 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
-use index::fulltext_index::tokenizer::{EnglishTokenizer, Tokenizer};
+use std::collections::HashMap;
+use std::hint::black_box;
+use std::path::PathBuf;
+use std::time::Duration;
+
+use async_trait::async_trait;
+use criterion::{BatchSize, BenchmarkId, Criterion, Throughput, criterion_group, criterion_main};
+use futures::AsyncRead;
+use index::fulltext_index::create::{FulltextIndexCreator, TantivyFulltextIndexCreator};
+use index::fulltext_index::tokenizer::{ChineseTokenizer, EnglishTokenizer, Tokenizer};
+use index::fulltext_index::{Analyzer, Config};
+use puffin::puffin_manager::{PuffinWriter, PutOptions};
+
+const CHINESE_TOKENIZER_TEXTS: &[(&str, &str)] = &[
+    ("short", "登录手机号。中国农业银行。"),
+    (
+        "mixed_log",
+        "2025-08-01 21:09:28 用户登录失败 trace_id=abc_123 dynamic_key=mobile_login 中国农业银行接口返回超时。",
+    ),
+    (
+        "product_search",
+        "哈基米哦南北绿豆，噢马自立曼波。装电视台，中国中央广播电视台。压不缩，笑不活。",
+    ),
+    (
+        "long_news",
+        "中国农业银行发布公告称，手机银行登录服务完成升级。多个地区用户反馈查询速度提升，后台监控显示核心链路延迟下降，异常请求自动重试次数减少。系统继续保留 trace_id、request_id 和 dynamic_key 等字段用于排查问题。",
+    ),
+];
+
+const CHINESE_INDEX_DOCS: &[&str] = &[
+    "登录手机号，中国农业银行手机银行接口返回成功。",
+    "用户登录失败，trace_id=abc_123，dynamic_key=mobile_login。",
+    "中国中央广播电视台发布新的节目预告。",
+    "装电视台的时候遇到压不缩的问题。",
+    "哈基米哦南北绿豆，噢马自立曼波。",
+    "后台监控显示核心链路延迟下降。",
+    "系统保留 request_id 用于排查问题。",
+    "中文全文索引需要兼顾召回率和 token 数量。",
+];
+
+struct NoopPuffinWriter;
+
+#[async_trait]
+impl PuffinWriter for NoopPuffinWriter {
+    async fn put_blob<R>(
+        &mut self,
+        _key: &str,
+        _raw_data: R,
+        _options: PutOptions,
+        _properties: HashMap<String, String>,
+    ) -> puffin::error::Result<u64>
+    where
+        R: AsyncRead + Send,
+    {
+        unreachable!("tantivy fulltext benchmark only writes directory blobs")
+    }
+
+    async fn put_dir(
+        &mut self,
+        _key: &str,
+        _dir: PathBuf,
+        _options: PutOptions,
+        _properties: HashMap<String, String>,
+    ) -> puffin::error::Result<u64> {
+        Ok(0)
+    }
+
+    fn set_footer_lz4_compressed(&mut self, _lz4_compressed: bool) {}
+
+    async fn finish(self) -> puffin::error::Result<u64> {
+        Ok(0)
+    }
+}

 fn bench_english_tokenizer(c: &mut Criterion) {
    let tokenizer = EnglishTokenizer;
@@ -86,5 +157,104 @@ fn bench_english_tokenizer(c: &mut Criterion) {
    repeat_group.finish();
 }

-criterion_group!(benches, bench_english_tokenizer);
+fn bench_chinese_tokenizer(c: &mut Criterion) {
+    let tokenizer = ChineseTokenizer;
+    let mut group = c.benchmark_group("chinese_tokenizer");
+
+    for (name, text) in CHINESE_TOKENIZER_TEXTS {
+        group.throughput(Throughput::Bytes(text.len() as u64));
+        group.bench_with_input(BenchmarkId::new("tokenize", name), text, |b, text| {
+            b.iter(|| black_box(tokenizer.tokenize(black_box(text))))
+        });
+    }
+
+    group.finish();
+
+    let mut repeat_group = c.benchmark_group("chinese_tokenizer_repeated");
+    let sample_text = CHINESE_TOKENIZER_TEXTS
+        .iter()
+        .find(|(name, _)| *name == "mixed_log")
+        .map(|(_, text)| *text)
+        .expect("mixed_log sample must exist");
+
+    for repeat_count in [10, 100, 1000] {
+        repeat_group.bench_with_input(
+            BenchmarkId::new("repeated_tokenize", repeat_count),
+            &repeat_count,
+            |b, &repeat_count| {
+                b.iter(|| {
+                    for _ in 0..repeat_count {
+                        black_box(tokenizer.tokenize(black_box(sample_text)));
+                    }
+                })
+            },
+        );
+    }
+
+    repeat_group.finish();
+}
+
+fn bench_tantivy_chinese_fulltext_index(c: &mut Criterion) {
+    let runtime = tokio::runtime::Builder::new_current_thread()
+        .enable_all()
+        .build()
+        .expect("failed to create Tokio runtime");
+    let config = Config {
+        analyzer: Analyzer::Chinese,
+        case_sensitive: false,
+    };
+    let mut group = c.benchmark_group("tantivy_chinese_fulltext_index");
+    group.sample_size(10);
+    group.measurement_time(Duration::from_secs(10));
+
+    for doc_count in [32usize, 256usize] {
+        group.throughput(Throughput::Elements(doc_count as u64));
+        group.bench_with_input(
+            BenchmarkId::new("build_commit", doc_count),
+            &doc_count,
+            |b, &doc_count| {
+                b.iter_batched(
+                    tempfile::tempdir,
+                    |dir| {
+                        let dir = dir.expect("failed to create temp dir");
+                        runtime.block_on(async {
+                            let mut creator =
+                                TantivyFulltextIndexCreator::new(dir.path(), config, 64 << 20)
+                                    .await
+                                    .expect("failed to create tantivy fulltext index");
+                            for idx in 0..doc_count {
+                                let text = CHINESE_INDEX_DOCS[idx % CHINESE_INDEX_DOCS.len()];
+                                creator
+                                    .push_text(black_box(text))
+                                    .await
+                                    .expect("failed to push text");
+                            }
+                            let mut puffin_writer = NoopPuffinWriter;
+                            creator
+                                .finish(
+                                    &mut puffin_writer,
+                                    "tantivy_chinese_fulltext_index",
+                                    PutOptions::default(),
+                                )
+                                .await
+                                .expect("failed to commit tantivy fulltext index");
+                        });
+                        // Return the temp dir so Criterion drops it after timing the routine.
+                        dir
+                    },
+                    BatchSize::SmallInput,
+                )
+            },
+        );
+    }
+
+    group.finish();
+}
+
+criterion_group!(
+    benches,
+    bench_english_tokenizer,
+    bench_chinese_tokenizer,
+    bench_tantivy_chinese_fulltext_index
+);
 criterion_main!(benches);
--- a/src/index/src/fulltext_index.rs
+++ b/src/index/src/fulltext_index.rs
@@ -52,7 +52,7 @@ impl Config {
    fn build_tantivy_tokenizer(&self) -> TokenizerManager {
        let mut builder = match self.analyzer {
            Analyzer::English => TextAnalyzer::builder(SimpleTokenizer::default()).dynamic(),
-            Analyzer::Chinese => TextAnalyzer::builder(JiebaTokenizer {}).dynamic(),
+            Analyzer::Chinese => TextAnalyzer::builder(JiebaTokenizer::new()).dynamic(),
        };

        if !self.case_sensitive {
--- a/src/index/src/fulltext_index/tokenizer.rs
+++ b/src/index/src/fulltext_index/tokenizer.rs
@@ -98,7 +98,8 @@ impl Tokenizer for ChineseTokenizer {
            let mut tokens = JIEBA
                .cut_for_search(text, true)
                .into_iter()
-                .filter(|s| is_indexable_token(s))
+                .map(|token| token.word)
+                .filter(|token| is_indexable_token(token))
                .collect::<Vec<_>>();

            let english = EnglishTokenizer {};
@@ -336,10 +337,26 @@ mod tests {
        let text = "哈基米哦南北绿豆，噢马自立曼波。登录手机号。中国农业银行。装电视台，中国中央广播电视台。压不缩，笑不活。";

        let default_tokens = tokenizer.tokenize(text);
-        let cut_hmm_false = JIEBA.cut(text, false);
-        let cut_hmm_true = JIEBA.cut(text, true);
-        let cut_for_search_hmm_false = JIEBA.cut_for_search(text, false);
-        let cut_for_search_hmm_true = JIEBA.cut_for_search(text, true);
+        let cut_hmm_false = JIEBA
+            .cut(text, false)
+            .into_iter()
+            .map(|token| token.word)
+            .collect::<Vec<_>>();
+        let cut_hmm_true = JIEBA
+            .cut(text, true)
+            .into_iter()
+            .map(|token| token.word)
+            .collect::<Vec<_>>();
+        let cut_for_search_hmm_false = JIEBA
+            .cut_for_search(text, false)
+            .into_iter()
+            .map(|token| token.word)
+            .collect::<Vec<_>>();
+        let cut_for_search_hmm_true = JIEBA
+            .cut_for_search(text, true)
+            .into_iter()
+            .map(|token| token.word)
+            .collect::<Vec<_>>();

        assert_eq!(
            default_tokens,