From a8540ad39d5754e7b2cd29fc9766da37838bb665 Mon Sep 17 00:00:00 2001 From: discord9 <55937128+discord9@users.noreply.github.com> Date: Fri, 17 Apr 2026 16:33:29 +0800 Subject: [PATCH] perf: better jieba cut (#7984) * perf: better jieba cut Signed-off-by: discord9 * fix: also filter punctuation marks Signed-off-by: discord9 * chore Signed-off-by: discord9 * docs: explain why Signed-off-by: discord9 --------- Signed-off-by: discord9 --- src/index/src/fulltext_index/tokenizer.rs | 219 +++++++++++++++++++++- 1 file changed, 215 insertions(+), 4 deletions(-) diff --git a/src/index/src/fulltext_index/tokenizer.rs b/src/index/src/fulltext_index/tokenizer.rs index 3c55cf1009..b6694c555f 100644 --- a/src/index/src/fulltext_index/tokenizer.rs +++ b/src/index/src/fulltext_index/tokenizer.rs @@ -82,7 +82,9 @@ impl Tokenizer for EnglishTokenizer { /// `ChineseTokenizer` tokenizes a Chinese text. /// -/// It uses the Jieba tokenizer to split the text into Chinese words. +/// It uses Jieba search-mode tokenization to improve recall for Chinese fulltext search. +/// Enabling HMM also helps merge some unknown fragments into larger tokens, which can reduce +/// token cardinality versus a fully fragmented output. #[derive(Debug, Default)] pub struct ChineseTokenizer; @@ -91,7 +93,13 @@ impl Tokenizer for ChineseTokenizer { if text.is_ascii() { EnglishTokenizer {}.tokenize(text) } else { - JIEBA.cut(text, false) + // Search-mode tokenization emits finer-grained searchable terms, while HMM helps + // merge some unknown fragments and avoid excessive token fragmentation. 
+ JIEBA + .cut_for_search(text, true) + .into_iter() + .filter(|s| s.chars().any(|c| c.is_alphanumeric() || c == '_')) + .collect() } } } @@ -174,19 +182,222 @@ mod tests { let tokens = tokenizer.tokenize(text); assert_eq!( tokens, - vec![ + [ "登录", + "手机", "手机号", "18888888888", "的", "动态", "key", - ":", "829889AC8" ] ); } + #[test] + fn test_chinese_tokenizer_aggressive_tokenization_probe() { + let tokenizer = ChineseTokenizer; + let text = "哈基米哦南北绿豆,噢马自立曼波。登录手机号。中国农业银行。装电视台,中国中央广播电视台。压不缩,笑不活。"; + + let default_tokens = tokenizer.tokenize(text); + let cut_hmm_false = JIEBA.cut(text, false); + let cut_hmm_true = JIEBA.cut(text, true); + let cut_for_search_hmm_false = JIEBA.cut_for_search(text, false); + let cut_for_search_hmm_true = JIEBA.cut_for_search(text, true); + + assert_eq!( + default_tokens, + [ + "哈基米", + "哦", + "南北", + "绿豆", + "噢", + "马", + "自立", + "曼波", + "登录", + "手机", + "手机号", + "中国", + "农业", + "银行", + "中国农业银行", + "装", + "电视", + "电视台", + "中国", + "中央", + "广播", + "电视", + "电视台", + "不缩", + "压不缩", + "笑", + "不活", + ] + ); + assert_eq!( + cut_hmm_false, + [ + "哈", + "基", + "米", + "哦", + "南北", + "绿豆", + ",", + "噢", + "马", + "自立", + "曼", + "波", + "。", + "登录", + "手机号", + "。", + "中国农业银行", + "。", + "装", + "电视台", + ",", + "中国", + "中央", + "广播", + "电视台", + "。", + "压", + "不", + "缩", + ",", + "笑", + "不", + "活", + "。" + ] + ); + assert_eq!( + cut_hmm_true, + [ + "哈基米", + "哦", + "南北", + "绿豆", + ",", + "噢", + "马", + "自立", + "曼波", + "。", + "登录", + "手机号", + "。", + "中国农业银行", + "。", + "装", + "电视台", + ",", + "中国", + "中央", + "广播", + "电视台", + "。", + "压不缩", + ",", + "笑", + "不活", + "。" + ] + ); + assert_eq!( + cut_for_search_hmm_false, + [ + "哈", + "基", + "米", + "哦", + "南北", + "绿豆", + ",", + "噢", + "马", + "自立", + "曼", + "波", + "。", + "登录", + "手机", + "手机号", + "。", + "中国", + "农业", + "银行", + "中国农业银行", + "。", + "装", + "电视", + "电视台", + ",", + "中国", + "中央", + "广播", + "电视", + "电视台", + "。", + "压", + "不", + "缩", + ",", + "笑", + "不", + "活", + "。" + ] + ); + + assert_eq!( + 
cut_for_search_hmm_true, + [ + "哈基米", + "哦", + "南北", + "绿豆", + ",", + "噢", + "马", + "自立", + "曼波", + "。", + "登录", + "手机", + "手机号", + "。", + "中国", + "农业", + "银行", + "中国农业银行", + "。", + "装", + "电视", + "电视台", + ",", + "中国", + "中央", + "广播", + "电视", + "电视台", + "。", + "不缩", + "压不缩", + ",", + "笑", + "不活", + "。" + ] + ); + } + #[test] fn test_valid_ascii_token_lookup_table() { // Test all ASCII values in a single loop